389 files changed, 18157 insertions, 10853 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e051147679..814ac4e213a 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
          If unsure, say N.
+if 9P_FS
 config 9P_FSCACHE
        bool "Enable 9P client caching support (EXPERIMENTAL)"
        depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
 config 9P_FS_POSIX_ACL
        bool "9P POSIX Access Control Lists"
-        depends on 9P_FS
        select FS_POSIX_ACL
        help
          POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
          Linux website <http://acl.bestbits.at/>.
          If you don't know what Access Control Lists are, say N
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1..ab8c1278063 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
        vfs_super.o \
        vfs_inode.o \
+        vfs_inode_dotl.o \
        vfs_addr.o \
        vfs_file.o \
        vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 12d602351db..02a2cf61631 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
        ssize_t size;
        void *value = NULL;
-        struct posix_acl *acl = NULL;;
+        struct posix_acl *acl = NULL;
        size = v9fs_fid_xattr_get(fid, name, NULL, 0);
        if (size > 0) {
@@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
        return acl;
 }
-int v9fs_check_acl(struct inode *inode, int mask)
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        struct v9fs_session_info *v9ses;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        v9ses = v9fs_inode2v9ses(inode);
        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
                /*
@@ -362,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
        case ACL_TYPE_DEFAULT:
                name = POSIX_ACL_XATTR_DEFAULT;
                if (!S_ISDIR(inode->i_mode)) {
-                        retval = -EINVAL;
+                        retval = acl ? -EINVAL : 0;
                        goto err_out;
                }
                break;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 59e18c2e8c7..7ef3ac9f6d9 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
                               struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2..c4b5d8864f0 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
                                                                        char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+                        struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+                        void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+                        struct p9_fid *fid,
+                        struct super_block *sb);
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+                        struct p9_fid *fid,
+                        struct super_block *sb);
 /* other default globals */
 #define V9FS_PORT       564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
        return v9ses->flags & V9FS_PROTO_2000L;
 }
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                                struct super_block *sb)
+{
+        if (v9fs_proto_dotl(v9ses)) /* V9FS_PROTO_2000L flag set */
+                return v9fs_inode_dotl(v9ses, fid, sb);
+        else
+                return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f393..466d2a4fc5c 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
 *
 */
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
                                                                        dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
 *
 */
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 34bf71b5654..5076eeb9550 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
-static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
-static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
-static const struct inode_operations v9fs_symlink_inode_operations_dotl;
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                    dev_t rdev);
 /**
 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -237,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 *
 */
-void v9fs_destroy_inode(struct inode *inode)
+static void v9fs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
 }
-#endif
-/**
+void v9fs_destroy_inode(struct inode *inode)
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
- * new file system object. This checks the S_ISGID to determine the owning
- * group of the new file system object.
- */
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
-{
-        BUG_ON(dir_inode == NULL);
-        if (dir_inode->i_mode & S_ISGID) {
-                /* set_gid bit is set.*/
-                return dir_inode->i_gid;
-        }
-        return current_fsgid();
-}
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 {
-        struct dentry *dentry;
+        call_rcu(&inode->i_rcu, v9fs_i_callback);
-        spin_lock(&dcache_lock);
-        /* Directory should have only one entry. */
-        BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-        dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-        spin_unlock(&dcache_lock);
-        return dentry;
 }
+#endif
 /**
 * v9fs_get_inode - helper function to setup an inode
@@ -447,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
-static struct inode *
+struct inode *
 v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
        struct super_block *sb)
 {
@@ -482,60 +447,6 @@ error:
        return ERR_PTR(err);
 }
-static struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-        struct super_block *sb)
-{
-        struct inode *ret = NULL;
-        int err;
-        struct p9_stat_dotl *st;
-        st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-        if (IS_ERR(st))
-                return ERR_CAST(st);
-        ret = v9fs_get_inode(sb, st->st_mode);
-        if (IS_ERR(ret)) {
-                err = PTR_ERR(ret);
-                goto error;
-        }
-        v9fs_stat2inode_dotl(st, ret);
-        ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-        v9fs_vcookie_set_qid(ret, &st->qid);
-        v9fs_cache_inode_get_cookie(ret);
-#endif
-        err = v9fs_get_acl(ret, fid);
-        if (err) {
-                iput(ret);
-                goto error;
-        }
-        kfree(st);
-        return ret;
-error:
-        kfree(st);
-        return ERR_PTR(err);
-}
-/**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-                        struct super_block *sb)
-{
-        if (v9fs_proto_dotl(v9ses))
-                return v9fs_inode_dotl(v9ses, fid, sb);
-        else
-                return v9fs_inode(v9ses, fid, sb);
-}
 /**
 * v9fs_remove - helper function to remove files and directories
 * @dir: directory inode that is being deleted
@@ -626,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
                goto error;
        }
-        if (v9ses->cache)
-                dentry->d_op = &v9fs_cached_dentry_operations;
-        else
-                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        err = v9fs_fid_add(dentry, fid);
        if (err < 0)
@@ -650,144 +555,6 @@ error:
 }
 /**
- * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
- * @dir: directory inode that is being created
- * @dentry:  dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
- *
- */
-static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                struct nameidata *nd)
-{
-        int err = 0;
-        char *name = NULL;
-        gid_t gid;
-        int flags;
-        mode_t mode;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL;
-        struct p9_fid *dfid, *ofid;
-        struct file *filp;
-        struct p9_qid qid;
-        struct inode *inode;
-        struct posix_acl *pacl = NULL, *dacl = NULL;
-        v9ses = v9fs_inode2v9ses(dir);
-        if (nd && nd->flags & LOOKUP_OPEN)
-                flags = nd->intent.open.flags - 1;
-        else {
-                /*
-                 * create call without LOOKUP_OPEN is due
-                 * to mknod of regular files. So use mknod
-                 * operation.
-                 */
-                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
-        }
-        name = (char *) dentry->d_name.name;
-        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-                        "mode:0x%x\n", name, flags, omode);
-        dfid = v9fs_fid_lookup(dentry->d_parent);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                return err;
-        }
-        /* clone a fid to use for creation */
-        ofid = p9_client_walk(dfid, 0, NULL, 1);
-        if (IS_ERR(ofid)) {
-                err = PTR_ERR(ofid);
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-                return err;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in creat %d\n", err);
-                goto error;
-        }
-        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                                "p9_client_open_dotl failed in creat %d\n",
-                                err);
-                goto error;
-        }
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
-            (nd && nd->flags & LOOKUP_OPEN)) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                /* The fid would get clunked via a dput */
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate
-                 * inode with stat. We need to get an inode
-                 * so that we can set the acl with dentry
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-        /* if we are opening a file, assign the open fid to the file */
-        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-                if (IS_ERR(filp)) {
-                        p9_client_clunk(ofid);
-                        return PTR_ERR(filp);
-                }
-                filp->private_data = ofid;
-        } else
-                p9_client_clunk(ofid);
-        return 0;
-error:
-        if (ofid)
-                p9_client_clunk(ofid);
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-/**
 * v9fs_vfs_create - VFS hook to create files
 * @dir: directory inode that is being created
 * @dentry:  dentry that is being deleted
@@ -877,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return err;
 }
-/**
- * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
- * @dir:  inode that is being unlinked
- * @dentry: dentry that is being unlinked
- * @mode: mode for new directory
- *
- */
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-                               struct dentry *dentry, int omode)
-{
-        int err;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL, *dfid = NULL;
-        gid_t gid;
-        char *name;
-        mode_t mode;
-        struct inode *inode;
-        struct p9_qid qid;
-        struct dentry *dir_dentry;
-        struct posix_acl *dacl = NULL, *pacl = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
-        err = 0;
-        v9ses = v9fs_inode2v9ses(dir);
-        omode |= S_IFDIR;
-        if (dir->i_mode & S_ISGID)
-                omode |= S_ISGID;
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                dfid = NULL;
-                goto error;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in mkdir %d\n", err);
-                goto error;
-        }
-        name = (char *) dentry->d_name.name;
-        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
-        if (err < 0)
-                goto error;
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate
-                 * inode with stat. We need to get an inode
-                 * so that we can set the acl with dentry
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
 /**
 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
 * @dir:  inode that is being walked from
@@ -986,7 +652,7 @@ error:
 *
 */
-static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nameidata)
 {
        struct super_block *sb;
@@ -1034,9 +700,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 inst_out:
        if (v9ses->cache)
-                dentry->d_op = &v9fs_cached_dentry_operations;
+                d_set_d_op(dentry, &v9fs_cached_dentry_operations);
        else
-                dentry->d_op = &v9fs_dentry_operations;
+                d_set_d_op(dentry, &v9fs_dentry_operations);
        d_add(dentry, inode);
        return NULL;
@@ -1056,7 +722,7 @@ error:
 *
 */
-static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 {
        return v9fs_remove(i, d, 0);
 }
@@ -1068,7 +734,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 *
 */
-static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
        return v9fs_remove(i, d, 1);
 }
@@ -1082,7 +748,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 *
 */
-static int
+int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -1189,42 +855,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
-static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-                 struct kstat *stat)
-{
-        int err;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid;
-        struct p9_stat_dotl *st;
-        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-        err = -EPERM;
-        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-                return simple_getattr(mnt, dentry, stat);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        /* Ask for all the fields in stat structure. Server will return
-         * whatever it supports
-         */
-        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-        if (IS_ERR(st))
-                return PTR_ERR(st);
-        v9fs_stat2inode_dotl(st, dentry->d_inode);
-        generic_fillattr(dentry->d_inode, stat);
-        /* Change block size to what the server returned */
-        stat->blksize = st->st_blksize;
-        kfree(st);
-        return 0;
-}
 /**
 * v9fs_vfs_setattr - set file metadata
 * @dentry: file whose metadata to set
@@ -1284,64 +914,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 /**
- * v9fs_vfs_setattr_dotl - set file metadata
- * @dentry: file whose metadata to set
- * @iattr: metadata assignment structure
- *
- */
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
-{
-        int retval;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid;
-        struct p9_iattr_dotl p9attr;
-        P9_DPRINTK(P9_DEBUG_VFS, "\n");
-        retval = inode_change_ok(dentry->d_inode, iattr);
-        if (retval)
-                return retval;
-        p9attr.valid = iattr->ia_valid;
-        p9attr.mode = iattr->ia_mode;
-        p9attr.uid = iattr->ia_uid;
-        p9attr.gid = iattr->ia_gid;
-        p9attr.size = iattr->ia_size;
-        p9attr.atime_sec = iattr->ia_atime.tv_sec;
-        p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
-        p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
-        p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
-        retval = -EPERM;
-        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        retval = p9_client_setattr(fid, &p9attr);
-        if (retval < 0)
-                return retval;
-        if ((iattr->ia_valid & ATTR_SIZE) &&
-            iattr->ia_size != i_size_read(dentry->d_inode)) {
-                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-                if (retval)
-                        return retval;
-        }
-        setattr_copy(dentry->d_inode, iattr);
-        mark_inode_dirty(dentry->d_inode);
-        if (iattr->ia_valid & ATTR_MODE) {
-                /* We also want to update ACL when we update mode bits */
-                retval = v9fs_acl_chmod(dentry);
-                if (retval < 0)
-                        return retval;
-        }
-        return 0;
-}
-/**
 * v9fs_stat2inode - populate an inode structure with mistat info
 * @stat: Plan 9 metadata (mistat) structure
 * @inode: inode to populate
@@ -1419,77 +991,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 /**
- * v9fs_stat2inode_dotl - populate an inode structure with stat info
- * @stat: stat structure
- * @inode: inode to populate
- * @sb: superblock of filesystem
- *
- */
-void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
-{
-        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-                inode->i_atime.tv_sec = stat->st_atime_sec;
-                inode->i_atime.tv_nsec = stat->st_atime_nsec;
-                inode->i_mtime.tv_sec = stat->st_mtime_sec;
-                inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-                inode->i_ctime.tv_sec = stat->st_ctime_sec;
-                inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-                inode->i_uid = stat->st_uid;
-                inode->i_gid = stat->st_gid;
-                inode->i_nlink = stat->st_nlink;
-                inode->i_mode = stat->st_mode;
-                inode->i_rdev = new_decode_dev(stat->st_rdev);
-                if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-                        init_special_inode(inode, inode->i_mode, inode->i_rdev);
-                i_size_write(inode, stat->st_size);
-                inode->i_blocks = stat->st_blocks;
-        } else {
-                if (stat->st_result_mask & P9_STATS_ATIME) {
-                        inode->i_atime.tv_sec = stat->st_atime_sec;
-                        inode->i_atime.tv_nsec = stat->st_atime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_MTIME) {
-                        inode->i_mtime.tv_sec = stat->st_mtime_sec;
-                        inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_CTIME) {
-                        inode->i_ctime.tv_sec = stat->st_ctime_sec;
-                        inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_UID)
-                        inode->i_uid = stat->st_uid;
-                if (stat->st_result_mask & P9_STATS_GID)
-                        inode->i_gid = stat->st_gid;
-                if (stat->st_result_mask & P9_STATS_NLINK)
-                        inode->i_nlink = stat->st_nlink;
-                if (stat->st_result_mask & P9_STATS_MODE) {
-                        inode->i_mode = stat->st_mode;
-                        if ((S_ISBLK(inode->i_mode)) ||
-                                                (S_ISCHR(inode->i_mode)))
-                                init_special_inode(inode, inode->i_mode,
-                                                                inode->i_rdev);
-                }
-                if (stat->st_result_mask & P9_STATS_RDEV)
-                        inode->i_rdev = new_decode_dev(stat->st_rdev);
-                if (stat->st_result_mask & P9_STATS_SIZE)
-                        i_size_write(inode, stat->st_size);
-                if (stat->st_result_mask & P9_STATS_BLOCKS)
-                        inode->i_blocks = stat->st_blocks;
-        }
-        if (stat->st_result_mask & P9_STATS_GEN)
-                        inode->i_generation = stat->st_gen;
-        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
-         * because the inode structure does not have fields for them.
-         */
-}
-/**
 * v9fs_qid2ino - convert qid into inode number
 * @qid: qid to hash
 *
@@ -1595,7 +1096,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 *
 */
-static void
+void
 v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
        char *s = nd_get_link(nd);
@@ -1639,94 +1140,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 /**
- * v9fs_vfs_symlink_dotl - helper function to create symlinks
- * @dir: directory inode containing symlink
- * @dentry: dentry for symlink
- * @symname: symlink data
- *
- * See Also: 9P2000.L RFC for more information
- *
- */
-static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-                const char *symname)
-{
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *dfid;
-        struct p9_fid *fid = NULL;
-        struct inode *inode;
-        struct p9_qid qid;
-        char *name;
-        int err;
-        gid_t gid;
-        name = (char *) dentry->d_name.name;
-        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
-                        dir->i_ino, name, symname);
-        v9ses = v9fs_inode2v9ses(dir);
-        dfid = v9fs_fid_lookup(dentry->d_parent);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                return err;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
-        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
-                goto error;
-        }
-        if (v9ses->cache) {
-                /* Now walk from the parent so we can get an unopened fid. */
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                        err);
-                        fid = NULL;
-                        goto error;
-                }
-                /* instantiate inode and assign the unopened fid to dentry */
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                        err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /* Not in cached mode. No need to populate inode with stat */
-                inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-/**
 * v9fs_vfs_symlink - helper function to create symlinks
 * @dir: directory inode containing symlink
 * @dentry: dentry for symlink
@@ -1785,77 +1198,6 @@ clunk_fid:
 }
 /**
- * v9fs_vfs_link_dotl - create a hardlink for dotl
- * @old_dentry: dentry for file to link to
- * @dir: inode destination for new link
- * @dentry: dentry for link
- *
- */
-static int
-v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
-                struct dentry *dentry)
-{
-        int err;
-        struct p9_fid *dfid, *oldfid;
-        char *name;
-        struct v9fs_session_info *v9ses;
-        struct dentry *dir_dentry;
-        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-                        dir->i_ino, old_dentry->d_name.name,
-                        dentry->d_name.name);
-        v9ses = v9fs_inode2v9ses(dir);
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid))
-                return PTR_ERR(dfid);
-        oldfid = v9fs_fid_lookup(old_dentry);
-        if (IS_ERR(oldfid))
-                return PTR_ERR(oldfid);
-        name = (char *) dentry->d_name.name;
-        err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
-                return err;
-        }
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                /* Get the latest stat info from server. */
-                struct p9_fid *fid;
-                struct p9_stat_dotl *st;
-                fid = v9fs_fid_lookup(old_dentry);
-                if (IS_ERR(fid))
-                        return PTR_ERR(fid);
-                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-                if (IS_ERR(st))
-                        return PTR_ERR(st);
-                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-                kfree(st);
-        } else {
-                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just hold the
-                 * inode
-                 */
-                ihold(old_dentry->d_inode);
-        }
-        dentry->d_op = old_dentry->d_op;
-        d_instantiate(dentry, old_dentry->d_inode);
-        return err;
-}
-/**
 * v9fs_vfs_mknod - create a special file
 * @dir: inode destination for new link
 * @dentry: dentry for file
@@ -1900,160 +1242,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
-/**
- * v9fs_vfs_mknod_dotl - create a special file
- * @dir: inode destination for new link
- * @dentry: dentry for file
- * @mode: mode for creation
- * @rdev: device associated with special file
- *
- */
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                dev_t rdev)
-{
-        int err;
-        char *name;
-        mode_t mode;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL, *dfid = NULL;
-        struct inode *inode;
-        gid_t gid;
-        struct p9_qid qid;
-        struct dentry *dir_dentry;
-        struct posix_acl *dacl = NULL, *pacl = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS,
-                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
-        if (!new_valid_dev(rdev))
-                return -EINVAL;
-        v9ses = v9fs_inode2v9ses(dir);
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                dfid = NULL;
-                goto error;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in mknod %d\n", err);
-                goto error;
-        }
-        name = (char *) dentry->d_name.name;
-        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
-        if (err < 0)
-                goto error;
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate inode with stat.
-                 * socket syscall returns a fd, so we need instantiate
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-static int
-v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
-{
-        int retval;
-        struct p9_fid *fid;
-        char *target = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
-        retval = -EPERM;
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        retval = p9_client_readlink(fid, &target);
-        if (retval < 0)
-                return retval;
-        strncpy(buffer, target, buflen);
-        P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
-        retval = strnlen(buffer, buflen);
-        return retval;
-}
-/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
-{
-        int len = 0;
-        char *link = __getname();
-        P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
-        if (!link)
-                link = ERR_PTR(-ENOMEM);
-        else {
-                len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
-                if (len < 0) {
-                        __putname(link);
-                        link = ERR_PTR(len);
-                } else
-                        link[min(len, PATH_MAX-1)] = 0;
-        }
-        nd_set_link(nd, link);
-        return NULL;
-}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -2068,25 +1256,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-        .create = v9fs_vfs_create_dotl,
-        .lookup = v9fs_vfs_lookup,
-        .link = v9fs_vfs_link_dotl,
-        .symlink = v9fs_vfs_symlink_dotl,
-        .unlink = v9fs_vfs_unlink,
-        .mkdir = v9fs_vfs_mkdir_dotl,
-        .rmdir = v9fs_vfs_rmdir,
-        .mknod = v9fs_vfs_mknod_dotl,
-        .rename = v9fs_vfs_rename,
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-        .check_acl = v9fs_check_acl,
-};
 static const struct inode_operations v9fs_dir_inode_operations = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -2104,16 +1273,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_file_inode_operations_dotl = {
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-        .check_acl = v9fs_check_acl,
-};
 static const struct inode_operations v9fs_symlink_inode_operations = {
        .readlink = generic_readlink,
        .follow_link = v9fs_vfs_follow_link,
@@ -2122,14 +1281,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-        .readlink = v9fs_vfs_readlink_dotl,
-        .follow_link = v9fs_vfs_follow_link_dotl,
-        .put_link = v9fs_vfs_put_link,
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 00000000000..fe3ffa9aace
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
+/*
+ *  linux/fs/9p/vfs_inode_dotl.c
+ *
+ * This file contains vfs inode ops for the 9P2000.L protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "cache.h"
+#include "xattr.h"
+#include "acl.h"
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                    dev_t rdev);
+/**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+        BUG_ON(dir_inode == NULL);
+        if (dir_inode->i_mode & S_ISGID) {
+                /* set_gid bit is set.*/
+                return dir_inode->i_gid;
+        }
+        return current_fsgid();
+}
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+        struct dentry *dentry;
+        spin_lock(&inode->i_lock);
+        /* A directory inode should have exactly one dentry alias. */
+        BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+        dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+        spin_unlock(&inode->i_lock);
+        return dentry;
+}
+struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+        struct super_block *sb)
+{
+        struct inode *ret = NULL;
+        int err;
+        struct p9_stat_dotl *st;
+        st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+        if (IS_ERR(st))
+                return ERR_CAST(st);
+        ret = v9fs_get_inode(sb, st->st_mode);
+        if (IS_ERR(ret)) {
+                err = PTR_ERR(ret);
+                goto error;
+        }
+        v9fs_stat2inode_dotl(st, ret);
+        ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+        v9fs_vcookie_set_qid(ret, &st->qid);
+        v9fs_cache_inode_get_cookie(ret);
+#endif
+        err = v9fs_get_acl(ret, fid);
+        if (err) {
+                iput(ret);
+                goto error;
+        }
+        kfree(st);
+        return ret;
+error:
+        kfree(st);
+        return ERR_PTR(err);
+}
+/**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: inode of the directory in which the file is created
+ * @dentry: dentry of the file being created
+ * @omode: create permissions
+ * @nd: path information
+ *
+ */
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                struct nameidata *nd)
+{
+        int err = 0;
+        char *name = NULL;
+        gid_t gid;
+        int flags;
+        mode_t mode;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL;
+        struct p9_fid *dfid, *ofid;
+        struct file *filp;
+        struct p9_qid qid;
+        struct inode *inode;
+        struct posix_acl *pacl = NULL, *dacl = NULL;
+        v9ses = v9fs_inode2v9ses(dir);
+        if (nd && nd->flags & LOOKUP_OPEN)
+                flags = nd->intent.open.flags - 1;
+        else {
+                /*
+                 * create call without LOOKUP_OPEN is due
+                 * to mknod of regular files. So use mknod
+                 * operation.
+                 */
+                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+        }
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+                        "mode:0x%x\n", name, flags, omode);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        /* clone a fid to use for creation */
+        ofid = p9_client_walk(dfid, 0, NULL, 1);
+        if (IS_ERR(ofid)) {
+                err = PTR_ERR(ofid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in creat %d\n", err);
+                goto error;
+        }
+        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                                "p9_client_open_dotl failed in creat %d\n",
+                                err);
+                goto error;
+        }
+        /* instantiate inode and assign the unopened fid to the dentry */
+        fid = p9_client_walk(dfid, 1, &name, 1);
+        if (IS_ERR(fid)) {
+                err = PTR_ERR(fid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                fid = NULL;
+                goto error;
+        }
+        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+                goto error;
+        }
+        d_instantiate(dentry, inode);
+        err = v9fs_fid_add(dentry, fid);
+        if (err < 0)
+                goto error;
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+        /* Since we are opening a file, assign the open fid to the file */
+        filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
+        if (IS_ERR(filp)) {
+                p9_client_clunk(ofid);
+                return PTR_ERR(filp);
+        }
+        filp->private_data = ofid;
+        return 0;
+error:
+        if (ofid)
+                p9_client_clunk(ofid);
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the directory being created
+ * @omode: mode for new directory
+ *
+ */
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+                               struct dentry *dentry, int omode)
+{
+        int err;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        gid_t gid;
+        char *name;
+        mode_t mode;
+        struct inode *inode;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+        err = 0;
+        v9ses = v9fs_inode2v9ses(dir);
+        omode |= S_IFDIR;
+        if (dir->i_mode & S_ISGID)
+                omode |= S_ISGID;
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                dfid = NULL;
+                goto error;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mkdir %d\n", err);
+                goto error;
+        }
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        int err;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_stat_dotl *st;
+        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+        err = -EPERM;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+                return simple_getattr(mnt, dentry, stat);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        /* Ask for all the fields in stat structure. Server will return
+         * whatever it supports
+         */
+        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+        if (IS_ERR(st))
+                return PTR_ERR(st);
+        v9fs_stat2inode_dotl(st, dentry->d_inode);
+        generic_fillattr(dentry->d_inode, stat);
+        /* Change block size to what the server returned */
+        stat->blksize = st->st_blksize;
+        kfree(st);
+        return 0;
+}
+/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+        int retval;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_iattr_dotl p9attr;
+        P9_DPRINTK(P9_DEBUG_VFS, "\n");
+        retval = inode_change_ok(dentry->d_inode, iattr);
+        if (retval)
+                return retval;
+        p9attr.valid = iattr->ia_valid;
+        p9attr.mode = iattr->ia_mode;
+        p9attr.uid = iattr->ia_uid;
+        p9attr.gid = iattr->ia_gid;
+        p9attr.size = iattr->ia_size;
+        p9attr.atime_sec = iattr->ia_atime.tv_sec;
+        p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+        p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+        p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+        retval = -EPERM;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        retval = p9_client_setattr(fid, &p9attr);
+        if (retval < 0)
+                return retval;
+        if ((iattr->ia_valid & ATTR_SIZE) &&
+            iattr->ia_size != i_size_read(dentry->d_inode)) {
+                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+                if (retval)
+                        return retval;
+        }
+        setattr_copy(dentry->d_inode, iattr);
+        mark_inode_dirty(dentry->d_inode);
+        if (iattr->ia_valid & ATTR_MODE) {
+                /* We also want to update ACL when we update mode bits */
+                retval = v9fs_acl_chmod(dentry);
+                if (retval < 0)
+                        return retval;
+        }
+        return 0;
+}
+/**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ *
+ * Only the fields flagged in @stat->st_result_mask are copied to @inode.
+ */
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+                inode->i_atime.tv_sec = stat->st_atime_sec;
+                inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                inode->i_uid = stat->st_uid;
+                inode->i_gid = stat->st_gid;
+                inode->i_nlink = stat->st_nlink;
+                inode->i_mode = stat->st_mode;
+                inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+                        init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                i_size_write(inode, stat->st_size);
+                inode->i_blocks = stat->st_blocks;
+        } else {
+                if (stat->st_result_mask & P9_STATS_ATIME) {
+                        inode->i_atime.tv_sec = stat->st_atime_sec;
+                        inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_MTIME) {
+                        inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                        inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_CTIME) {
+                        inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                        inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_UID)
+                        inode->i_uid = stat->st_uid;
+                if (stat->st_result_mask & P9_STATS_GID)
+                        inode->i_gid = stat->st_gid;
+                if (stat->st_result_mask & P9_STATS_NLINK)
+                        inode->i_nlink = stat->st_nlink;
+                if (stat->st_result_mask & P9_STATS_MODE) {
+                        inode->i_mode = stat->st_mode;
+                        if ((S_ISBLK(inode->i_mode)) ||
+                                                (S_ISCHR(inode->i_mode)))
+                                init_special_inode(inode, inode->i_mode,
+                                                                inode->i_rdev);
+                }
+                if (stat->st_result_mask & P9_STATS_RDEV)
+                        inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if (stat->st_result_mask & P9_STATS_SIZE)
+                        i_size_write(inode, stat->st_size);
+                if (stat->st_result_mask & P9_STATS_BLOCKS)
+                        inode->i_blocks = stat->st_blocks;
+        }
+        if (stat->st_result_mask & P9_STATS_GEN)
+                        inode->i_generation = stat->st_gen;
+        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+         * because the inode structure does not have fields for them.
+         */
+}
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+                const char *symname)
+{
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *dfid;
+        struct p9_fid *fid = NULL;
+        struct inode *inode;
+        struct p9_qid qid;
+        char *name;
+        int err;
+        gid_t gid;
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+                        dir->i_ino, name, symname);
+        v9ses = v9fs_inode2v9ses(dir);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+                goto error;
+        }
+        if (v9ses->cache) {
+                /* Now walk from the parent so we can get an unopened fid. */
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                        err);
+                        fid = NULL;
+                        goto error;
+                }
+                /* instantiate inode and assign the unopened fid to dentry */
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                        err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /* Not in cached mode. No need to populate inode with stat */
+                inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+                struct dentry *dentry)
+{
+        int err;
+        struct p9_fid *dfid, *oldfid;
+        char *name;
+        struct v9fs_session_info *v9ses;
+        struct dentry *dir_dentry;
+        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+                        dir->i_ino, old_dentry->d_name.name,
+                        dentry->d_name.name);
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid))
+                return PTR_ERR(dfid);
+        oldfid = v9fs_fid_lookup(old_dentry);
+        if (IS_ERR(oldfid))
+                return PTR_ERR(oldfid);
+        name = (char *) dentry->d_name.name;
+        err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+                return err;
+        }
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                /* Get the latest stat info from server. */
+                struct p9_fid *fid;
+                struct p9_stat_dotl *st;
+                fid = v9fs_fid_lookup(old_dentry);
+                if (IS_ERR(fid))
+                        return PTR_ERR(fid);
+                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+                if (IS_ERR(st))
+                        return PTR_ERR(st);
+                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+                kfree(st);
+        } else {
+                /* Caching disabled. No need to get up-to-date stat info.
+                 * This dentry will be released immediately. So, just hold the
+                 * inode
+                 */
+                ihold(old_dentry->d_inode);
+        }
+        d_instantiate(dentry, old_dentry->d_inode);
+        return err;
+}
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode of the directory in which the node is created
+ * @dentry: dentry for file
+ * @omode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                dev_t rdev)
+{
+        int err;
+        char *name;
+        mode_t mode;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        struct inode *inode;
+        gid_t gid;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS,
+                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                dfid = NULL;
+                goto error;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mknod %d\n", err);
+                goto error;
+        }
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate inode with stat.
+                 * socket syscall returns a fd, so we need instantiate
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ * Reads the link target from the server with p9_client_readlink() and hands
+ * it to nd_set_link() in a buffer obtained from __getname().  On any failure
+ * the buffer is released and an ERR_PTR() value is stored in @nd instead:
+ * -ENOMEM when no buffer could be allocated, the fid lookup or readlink
+ * error, or -ENAMETOOLONG when the target does not fit the buffer.
+ *
+ * Always returns NULL.
+ */
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+        int retval;
+        struct p9_fid *fid;
+        char *link = __getname();
+        char *target;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+        if (!link) {
+                link = ERR_PTR(-ENOMEM);
+                goto ndset;
+        }
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid)) {
+                __putname(link);
+                link = ERR_CAST(fid);
+                goto ndset;
+        }
+        retval = p9_client_readlink(fid, &target);
+        if (!retval) {
+                /*
+                 * The target string is supplied by the server.  Refuse one
+                 * that would not fit the PATH_MAX-sized __getname() buffer
+                 * rather than letting strcpy() run past its end.
+                 */
+                if (strlen(target) >= PATH_MAX)
+                        retval = -ENAMETOOLONG;
+                else
+                        strcpy(link, target);
+                kfree(target);
+                if (!retval)
+                        goto ndset;
+        }
+        __putname(link);
+        link = ERR_PTR(retval);
+ndset:
+        nd_set_link(nd, link);
+        return NULL;
+}
+/*
+ * Inode operation table carrying the namespace-changing entry points.
+ * create, link, symlink, mkdir, mknod, getattr and setattr use the *_dotl
+ * handlers; lookup, unlink, rmdir and rename use the v9fs_vfs_* handlers
+ * without that suffix.  Extended attributes go through the generic_*xattr
+ * helpers plus v9fs_listxattr, and ACL checks through v9fs_check_acl.
+ * NOTE(review): presumably installed on directory inodes of 9P2000.L
+ * mounts - confirm at the assignment site.
+ */
+const struct inode_operations v9fs_dir_inode_operations_dotl = {
+        .create = v9fs_vfs_create_dotl,
+        .lookup = v9fs_vfs_lookup,
+        .link = v9fs_vfs_link_dotl,
+        .symlink = v9fs_vfs_symlink_dotl,
+        .unlink = v9fs_vfs_unlink,
+        .mkdir = v9fs_vfs_mkdir_dotl,
+        .rmdir = v9fs_vfs_rmdir,
+        .mknod = v9fs_vfs_mknod_dotl,
+        .rename = v9fs_vfs_rename,
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
+};
+/*
+ * Inode operation table with attribute handling only: getattr/setattr use
+ * the *_dotl handlers, extended attributes use the generic_*xattr helpers
+ * plus v9fs_listxattr, and ACL checks use v9fs_check_acl.
+ * NOTE(review): presumably installed on regular-file inodes of 9P2000.L
+ * mounts - confirm at the assignment site.
+ */
+const struct inode_operations v9fs_file_inode_operations_dotl = {
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
+};
+/*
+ * Inode operation table for symbolic links: readlink is generic_readlink,
+ * follow_link is v9fs_vfs_follow_link_dotl and put_link is
+ * v9fs_vfs_put_link.  Attribute and extended-attribute handlers match the
+ * other *_dotl tables; unlike them, no check_acl handler is set here.
+ */
+const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+        .readlink = generic_readlink,
+        .follow_link = v9fs_vfs_follow_link_dotl,
+        .put_link = v9fs_vfs_put_link,
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
+};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df8433..d288773871b 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
                        "p9_client_xattrcreate failed %d\n", retval);
                goto error;
        }
-        msize = fid->clnt->msize;;
+        msize = fid->clnt->msize;
        while (value_len) {
                if (value_len > (msize - P9_IOHDRSZ))
                        write_count = msize - P9_IOHDRSZ;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de74..bf7693c384f 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
 };
 static int
-adfs_hash(struct dentry *parent, struct qstr *qstr)
+adfs_hash(const struct dentry *parent, const struct inode *inode,
+                struct qstr *qstr)
 {
        const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
        const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
 * requirements of the underlying filesystem.
 */
 static int
-adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
+adfs_compare(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        int i;
-        if (entry->len != name->len)
+        if (len != name->len)
                return 1;
        for (i = 0; i < name->len; i++) {
                char a, b;
-                a = entry->name[i];
+                a = str[i];
                b = name->name[i];
                if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,7 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        struct object_info obj;
        int error;
-        dentry->d_op = &adfs_dentry_operations; 
+        d_set_d_op(dentry, &adfs_dentry_operations);
        lock_kernel();
        error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
        if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 959dbff2d42..a4041b52fbc 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void adfs_destroy_inode(struct inode *inode)
+static void adfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
+static void adfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, adfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -477,7 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
                adfs_error(sb, "get root inode failed\n");
                goto error;
        } else
-                sb->s_root->d_op = &adfs_dentry_operations;
+                d_set_d_op(sb->s_root, &adfs_dentry_operations);
        unlock_kernel();
        return 0;
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a..3a4557e8325 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
        void *data = dentry->d_fsdata;
        struct list_head *head, *next;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        head = &inode->i_dentry;
        next = head->next;
        while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
                }
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07..944a4042fb6 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,11 +13,19 @@
 typedef int (*toupper_t)(int);
 static int       affs_toupper(int ch);
-static int       affs_hash_dentry(struct dentry *, struct qstr *);
+static int       affs_hash_dentry(const struct dentry *,
-static int       affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+                const struct inode *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 static int       affs_intl_toupper(int ch);
-static int       affs_intl_hash_dentry(struct dentry *, struct qstr *);
+static int       affs_intl_hash_dentry(const struct dentry *,
-static int       affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+                const struct inode *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 const struct dentry_operations affs_dentry_operations = {
        .d_hash         = affs_hash_dentry,
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
 * Note: the dentry argument is the parent dentry.
 */
 static inline int
-__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 {
        const u8 *name = qstr->name;
        unsigned long hash;
        int i;
-        i = affs_check_name(qstr->name,qstr->len);
+        i = affs_check_name(qstr->name, qstr->len);
        if (i)
                return i;
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
 }
 static int
-affs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        return __affs_hash_dentry(dentry, qstr, affs_toupper);
+        return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        return __affs_hash_dentry(dentry, qstr, affs_intl_toupper);
+        return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
-static inline int
+static inline int __affs_compare_dentry(unsigned int len,
-__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper)
+                const char *str, const struct qstr *name, toupper_t toupper)
 {
-        const u8 *aname = a->name;
+        const u8 *aname = str;
-        const u8 *bname = b->name;
+        const u8 *bname = name->name;
-        int len;
-        /* 'a' is the qstr of an already existing dentry, so the name
+        /*
-         * must be valid. 'b' must be validated first.
+         * 'str' is the name of an already existing dentry, so the name
+         * must be valid. 'name' must be validated first.
         */
-        if (affs_check_name(b->name,b->len))
+        if (affs_check_name(name->name, name->len))
                return 1;
-        /* If the names are longer than the allowed 30 chars,
+        /*
+         * If the names are longer than the allowed 30 chars,
         * the excess is ignored, so their length may differ.
         */
-        len = a->len;
        if (len >= 30) {
-                if (b->len < 30)
+                if (name->len < 30)
                        return 1;
                len = 30;
-        } else if (len != b->len)
+        } else if (len != name->len)
                return 1;
        for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
 }
 static int
-affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return __affs_compare_dentry(dentry, a, b, affs_toupper);
+        return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return __affs_compare_dentry(dentry, a, b, affs_intl_toupper);
+        return __affs_compare_dentry(len, str, name, affs_intl_toupper);
 }
 /*
@@ -226,7 +240,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
                if (IS_ERR(inode))
                        return ERR_CAST(inode);
        }
-        dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
+        d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
        d_add(dentry, inode);
        return NULL;
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0cf7f4384cb..d39081bbe7c 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
        return &i->vfs_inode;
 }
-static void affs_destroy_inode(struct inode *inode)
+static void affs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
+static void affs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, affs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -475,7 +482,7 @@ got_root:
                printk(KERN_ERR "AFFS: Get root inode failed\n");
                goto out_error;
        }
-        sb->s_root->d_op = &affs_dentry_operations;
+        d_set_d_op(sb->s_root, &affs_dentry_operations);
        pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
        return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5439e1bc9a8..34a3263d60a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 static int afs_dir_open(struct inode *inode, struct file *file);
 static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
-static int afs_d_delete(struct dentry *dentry);
+static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
 static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
                                  loff_t fpos, u64 ino, unsigned dtype);
@@ -581,7 +582,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
        }
 success:
-        dentry->d_op = &afs_fs_dentry_operations;
+        d_set_d_op(dentry, &afs_fs_dentry_operations);
        d_add(dentry, inode);
        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
@@ -607,6 +608,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
        void *dir_version;
        int ret;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        vnode = AFS_FS_I(dentry->d_inode);
        if (dentry->d_inode)
@@ -730,7 +734,7 @@ out_bad:
 * - called from dput() when d_count is going to 0.
 * - return 1 to request dentry be unhashed, 0 otherwise
 */
-static int afs_d_delete(struct dentry *dentry)
+static int afs_d_delete(const struct dentry *dentry)
 {
        _enter("%s", dentry->d_name.name);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736f..6d4bc1c8ff6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -624,7 +624,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct inode *, int, unsigned int);
 /*
 * server.c
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e..f44b9d35537 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 * - AFS ACLs are attached to directories only, and a file is controlled by its
 *   parent directory's ACL
 */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct afs_vnode *vnode = AFS_FS_I(inode);
        afs_access_t uninitialized_var(access);
        struct key *key;
        int ret;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        _enter("{{%x:%u},%lx},%x,",
               vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
        }
        key_put(key);
-        ret = generic_permission(inode, mask, NULL);
+        ret = generic_permission(inode, mask, flags, NULL);
        _leave(" = %d", ret);
        return ret;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 27201cffece..f901a9d7c11 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -498,6 +498,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
        return &vnode->vfs_inode;
 }
+/*
+ * RCU callback queued by afs_destroy_inode() through call_rcu(): map the
+ * rcu_head back to its inode and the enclosing afs_vnode, reinitialise the
+ * inode's i_dentry list head and return the vnode to afs_inode_cachep.
+ */
+static void afs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct afs_vnode *vnode = AFS_FS_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(afs_inode_cachep, vnode);
+}
 /*
 * destroy an AFS inode struct
 */
@@ -511,7 +519,7 @@ static void afs_destroy_inode(struct inode *inode)
        ASSERTCMP(vnode->server, ==, NULL);
-        kmem_cache_free(afs_inode_cachep, vnode);
+        call_rcu(&inode->i_rcu, afs_i_callback);
        atomic_dec(&afs_count_active_inodes);
 }
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d7..5e00f15c54a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
        queue_delayed_work(aio_wq, &ctx->wq, timeout);
 }
-/*
- * aio_run_iocbs:
- *      Process all pending retries queued on the ioctx
- *      run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static inline void aio_run_iocbs(struct kioctx *ctx)
-{
-        int requeue;
-        spin_lock_irq(&ctx->ctx_lock);
-        requeue = __aio_run_iocbs(ctx);
-        spin_unlock_irq(&ctx->ctx_lock);
-        if (requeue)
-                aio_queue_work(ctx);
-}
 /*
- * just like aio_run_iocbs, but keeps running them until
+ * aio_run_all_iocbs:
- * the list stays empty
+ *      Process all pending retries queued on the ioctx
+ *      run list, and keep running them until the list
+ *      stays empty.
+ * Assumes it is operating within the aio issuer's mm context.
 */
 static inline void aio_run_all_iocbs(struct kioctx *ctx)
 {
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
        long ret = -EINVAL;
        if (likely(ioctx)) {
-                if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
+                if (likely(min_nr <= nr && min_nr >= 0))
                        ret = read_events(ioctx, min_nr, nr, events, timeout);
                put_ioctx(ioctx);
        }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 73097336ea2..98edb657b84 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -102,7 +102,7 @@ struct file *anon_inode_getfile(const char *name,
        this.name = name;
        this.len = strlen(name);
        this.hash = 0;
-        path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+        path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
        if (!path.dentry)
                goto err_module;
@@ -113,7 +113,7 @@ struct file *anon_inode_getfile(const char *name,
         */
        ihold(anon_inode_inode);
-        path.dentry->d_op = &anon_inodefs_dentry_operations;
+        d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
        d_instantiate(path.dentry, anon_inode_inode);
        error = -ENFILE;
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
        return 0;
 err_mntput:
-        mntput(anon_inode_mnt);
+        mntput_long(anon_inode_mnt);
 err_unregister_filesystem:
        unregister_filesystem(&anon_inode_fs_type);
 err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d..0fffe1c24ce 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do {							\
                current->pid, __func__, ##args);        \
 } while (0)
+extern spinlock_t autofs4_lock;
 /* Unified info structure.  This is pointed to by both the dentry and
   inode structures.  Each file in the filesystem has an instance of this
   structure.  It holds a reference to the dentry, so dentries are never
@@ -254,17 +257,15 @@ static inline int simple_positive(struct dentry *dentry)
        return dentry->d_inode && !d_unhashed(dentry);
 }
-static inline int __simple_empty(struct dentry *dentry)
+static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
-        struct dentry *child;
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-        int ret = 0;
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        if (ino) {
-        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
+                if (list_empty(&ino->expiring))
-                if (simple_positive(child))
+                        list_add(&ino->expiring, &sbi->expiring_list);
-                        goto out;
+        }
-        ret = 1;
+        return;
-out:
-        return ret;
 }
 static inline void autofs4_add_expiring(struct dentry *dentry)
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb..cc1d0136590 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -91,24 +91,64 @@ done:
 }
 /*
- * Calculate next entry in top down tree traversal.
+ * Calculate and dget next entry in top down tree traversal.
- * From next_mnt in namespace.c - elegant.
+ *
+ * A NULL @prev starts the walk: @root itself is returned with a reference
+ * taken.  Otherwise the reference on @prev is dropped and the next entry
+ * for which simple_positive() holds is returned with a reference taken via
+ * dget_dlock(), or NULL once the walk has climbed back up to @root.
+ * autofs4_lock and the d_lock of the dentries being examined are taken
+ * internally and are all released again before returning.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+                                                struct dentry *root)
 {
-        struct list_head *next = p->d_subdirs.next;
+        struct list_head *next;
+        struct dentry *p, *ret;
+        /*
+         * Start of a walk: begin at root.  Returning dget(prev) here would
+         * be dget(NULL), so the callers' loops would never visit an entry.
+         */
+        if (prev == NULL)
+                return dget(root);
+        spin_lock(&autofs4_lock);
+relock:
+        p = prev;
+        spin_lock(&p->d_lock);
+again:
+        next = p->d_subdirs.next;
         if (next == &p->d_subdirs) {
                 while (1) {
-                        if (p == root)
+                        struct dentry *parent;
+                        if (p == root) {
+                                spin_unlock(&p->d_lock);
+                                spin_unlock(&autofs4_lock);
+                                dput(prev);
                                 return NULL;
+                        }
+                        parent = p->d_parent;
+                        if (!spin_trylock(&parent->d_lock)) {
+                                spin_unlock(&p->d_lock);
+                                cpu_relax();
+                                goto relock;
+                        }
+                        spin_unlock(&p->d_lock);
                         next = p->d_u.d_child.next;
-                        if (next != &p->d_parent->d_subdirs)
+                        p = parent;
+                        if (next != &parent->d_subdirs)
                                 break;
-                        p = p->d_parent;
                 }
         }
-        return list_entry(next, struct dentry, d_u.d_child);
+        ret = list_entry(next, struct dentry, d_u.d_child);
+        spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+        /* Negative dentry - try next */
+        if (!simple_positive(ret)) {
+                spin_unlock(&ret->d_lock);
+                p = ret;
+                goto again;
+        }
+        dget_dlock(ret);
+        spin_unlock(&ret->d_lock);
+        spin_unlock(&p->d_lock);
+        spin_unlock(&autofs4_lock);
+        dput(prev);
+        return ret;
 }
 /*
@@ -158,18 +198,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
        if (!simple_positive(top))
                return 1;
-        spin_lock(&dcache_lock);
+        p = NULL;
-        for (p = top; p; p = next_dentry(p, top)) {
+        while ((p = get_next_positive_dentry(p, top))) {
-                /* Negative dentry - give up */
-                if (!simple_positive(p))
-                        continue;
                DPRINTK("dentry %p %.*s",
                        p, (int) p->d_name.len, p->d_name.name);
-                p = dget(p);
-                spin_unlock(&dcache_lock);
                /*
                 * Is someone visiting anywhere in the subtree ?
                 * If there's no mount we need to check the usage
@@ -198,16 +231,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                        else
                                ino_count++;
-                        if (atomic_read(&p->d_count) > ino_count) {
+                        if (p->d_count > ino_count) {
                                top_ino->last_used = jiffies;
                                dput(p);
                                return 1;
                        }
                }
-                dput(p);
-                spin_lock(&dcache_lock);
        }
-        spin_unlock(&dcache_lock);
        /* Timeout of a tree mount is ultimately determined by its top dentry */
        if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +256,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
        DPRINTK("parent %p %.*s",
                parent, (int)parent->d_name.len, parent->d_name.name);
-        spin_lock(&dcache_lock);
+        p = NULL;
-        for (p = parent; p; p = next_dentry(p, parent)) {
+        while ((p = get_next_positive_dentry(p, parent))) {
-                /* Negative dentry - give up */
-                if (!simple_positive(p))
-                        continue;
                DPRINTK("dentry %p %.*s",
                        p, (int) p->d_name.len, p->d_name.name);
-                p = dget(p);
-                spin_unlock(&dcache_lock);
                if (d_mountpoint(p)) {
                        /* Can we umount this guy */
                        if (autofs4_mount_busy(mnt, p))
-                                goto cont;
+                                continue;
                        /* Can we expire this guy */
                        if (autofs4_can_expire(p, timeout, do_now))
                                return p;
                }
-cont:
-                dput(p);
-                spin_lock(&dcache_lock);
        }
-        spin_unlock(&dcache_lock);
        return NULL;
 }
@@ -276,7 +295,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
                struct autofs_info *ino = autofs4_dentry_ino(root);
                if (d_mountpoint(root)) {
                        ino->flags |= AUTOFS_INF_MOUNTPOINT;
-                        root->d_mounted--;
+                        spin_lock(&root->d_lock);
+                        root->d_flags &= ~DCACHE_MOUNTED;
+                        spin_unlock(&root->d_lock);
                }
                ino->flags |= AUTOFS_INF_EXPIRING;
                init_completion(&ino->expire_complete);
@@ -302,8 +323,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
        unsigned long timeout;
        struct dentry *root = sb->s_root;
+        struct dentry *dentry;
        struct dentry *expired = NULL;
-        struct list_head *next;
        int do_now = how & AUTOFS_EXP_IMMEDIATE;
        int exp_leaves = how & AUTOFS_EXP_LEAVES;
        struct autofs_info *ino;
@@ -315,23 +336,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
        now = jiffies;
        timeout = sbi->exp_timeout;
-        spin_lock(&dcache_lock);
+        dentry = NULL;
-        next = root->d_subdirs.next;
+        while ((dentry = get_next_positive_dentry(dentry, root))) {
-        /* On exit from the loop expire is set to a dgot dentry
-         * to expire or it's NULL */
-        while ( next != &root->d_subdirs ) {
-                struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-                /* Negative dentry - give up */
-                if (!simple_positive(dentry)) {
-                        next = next->next;
-                        continue;
-                }
-                dentry = dget(dentry);
-                spin_unlock(&dcache_lock);
                spin_lock(&sbi->fs_lock);
                ino = autofs4_dentry_ino(dentry);
@@ -347,7 +353,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 2;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        /* Can we umount this guy */
@@ -369,7 +375,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                if (!exp_leaves) {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +389,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                } else {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -394,11 +400,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                }
 next:
                spin_unlock(&sbi->fs_lock);
-                dput(dentry);
-                spin_lock(&dcache_lock);
-                next = next->next;
        }
-        spin_unlock(&dcache_lock);
        return NULL;
 found:
@@ -408,9 +410,13 @@ found:
        ino->flags |= AUTOFS_INF_EXPIRING;
        init_completion(&ino->expire_complete);
        spin_unlock(&sbi->fs_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&expired->d_parent->d_lock);
+        spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
        list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&expired->d_lock);
+        spin_unlock(&expired->d_parent->d_lock);
+        spin_unlock(&autofs4_lock);
        return expired;
 }
@@ -499,7 +505,14 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
                spin_lock(&sbi->fs_lock);
                if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-                        sb->s_root->d_mounted++;
+                        spin_lock(&sb->s_root->d_lock);
+                        /*
+                         * If we haven't been expired away, then reset
+                         * mounted status.
+                         */
+                        if (mnt->mnt_parent != mnt)
+                                sb->s_root->d_flags |= DCACHE_MOUNTED;
+                        spin_unlock(&sb->s_root->d_lock);
                        ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
                }
                ino->flags &= ~AUTOFS_INF_EXPIRING;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac87e49fa70..a7bdb9dcac8 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -309,7 +309,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
                goto fail_iput;
        pipe = NULL;
-        root->d_op = &autofs4_sb_dentry_operations;
+        d_set_d_op(root, &autofs4_sb_dentry_operations);
        root->d_fsdata = ino;
        /* Can this call block? */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d34896cfb19..651e4ef563b 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,6 +23,8 @@
 #include "autofs_i.h"
+DEFINE_SPINLOCK(autofs4_lock); /* autofs4-wide lock; callers in this file nest it as sbi->fs_lock -> autofs4_lock -> sbi->lookup_lock -> dentry->d_lock */
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -142,12 +144,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
         * autofs file system so just let the libfs routines handle
         * it.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&dentry->d_lock);
        if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&autofs4_lock);
                return -ENOENT;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&autofs4_lock);
 out:
        return dcache_dir_open(inode, file);
@@ -252,9 +257,11 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
        /* We trigger a mount for almost all flags */
        lookup_type = autofs4_need_mount(nd->flags);
        spin_lock(&sbi->fs_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&dentry->d_lock);
        if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&autofs4_lock);
                spin_unlock(&sbi->fs_lock);
                goto follow;
        }
@@ -266,7 +273,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
         */
        if (ino->flags & AUTOFS_INF_PENDING ||
            (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&autofs4_lock);
                spin_unlock(&sbi->fs_lock);
                status = try_to_fill_dentry(dentry, nd->flags);
@@ -275,7 +283,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
                goto follow;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&autofs4_lock);
        spin_unlock(&sbi->fs_lock);
 follow:
        /*
@@ -306,12 +315,19 @@ out_error:
 */
 static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct inode *dir;
-        struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+        struct autofs_sb_info *sbi;
-        int oz_mode = autofs4_oz_mode(sbi);
+        int oz_mode;
        int flags = nd ? nd->flags : 0;
        int status = 1;
+        if (flags & LOOKUP_RCU)
+                return -ECHILD;
+        dir = dentry->d_parent->d_inode;
+        sbi = autofs4_sbi(dir->i_sb);
+        oz_mode = autofs4_oz_mode(sbi);
        /* Pending dentry */
        spin_lock(&sbi->fs_lock);
        if (autofs4_ispending(dentry)) {
@@ -346,12 +362,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
                return 0;
        /* Check for a non-mountpoint directory with no contents */
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&dentry->d_lock);
        if (S_ISDIR(dentry->d_inode->i_mode) &&
            !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
                DPRINTK("dentry=%p %.*s, emptydir",
                         dentry, dentry->d_name.len, dentry->d_name.name);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&autofs4_lock);
                /* The daemon never causes a mount to trigger */
                if (oz_mode)
@@ -367,7 +385,8 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
                return status;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&autofs4_lock);
        return 1;
 }
@@ -422,7 +441,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
        const unsigned char *str = name->name;
        struct list_head *p, *head;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        spin_lock(&sbi->lookup_lock);
        head = &sbi->active_list;
        list_for_each(p, head) {
@@ -436,7 +455,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
                spin_lock(&active->d_lock);
                /* Already gone? */
-                if (atomic_read(&active->d_count) == 0)
+                if (active->d_count == 0)
                        goto next;
                qstr = &active->d_name;
@@ -452,17 +471,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
                        goto next;
                if (d_unhashed(active)) {
-                        dget(active);
+                        dget_dlock(active);
                        spin_unlock(&active->d_lock);
                        spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&autofs4_lock);
                        return active;
                }
 next:
                spin_unlock(&active->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        return NULL;
 }
@@ -477,7 +496,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
        const unsigned char *str = name->name;
        struct list_head *p, *head;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        spin_lock(&sbi->lookup_lock);
        head = &sbi->expiring_list;
        list_for_each(p, head) {
@@ -507,17 +526,17 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
                        goto next;
                if (d_unhashed(expiring)) {
-                        dget(expiring);
+                        dget_dlock(expiring);
                        spin_unlock(&expiring->d_lock);
                        spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&autofs4_lock);
                        return expiring;
                }
 next:
                spin_unlock(&expiring->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        return NULL;
 }
@@ -559,7 +578,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
                 * we check for the hashed dentry and return the newly
                 * hashed dentry.
                 */
-                dentry->d_op = &autofs4_root_dentry_operations;
+                d_set_d_op(dentry, &autofs4_root_dentry_operations);
                /*
                 * And we need to ensure that the same dentry is used for
@@ -698,9 +717,9 @@ static int autofs4_dir_symlink(struct inode *dir,
        d_add(dentry, inode);
        if (dir == dir->i_sb->s_root->d_inode)
-                dentry->d_op = &autofs4_root_dentry_operations;
+                d_set_d_op(dentry, &autofs4_root_dentry_operations);
        else
-                dentry->d_op = &autofs4_dentry_operations;
+                d_set_d_op(dentry, &autofs4_dentry_operations);
        dentry->d_fsdata = ino;
        ino->dentry = dget(dentry);
@@ -753,12 +772,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
        dir->i_mtime = CURRENT_TIME;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        autofs4_add_expiring(dentry);
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        return 0;
 }
@@ -775,16 +794,20 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
        if (!autofs4_oz_mode(sbi))
                return -EACCES;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&sbi->lookup_lock);
+        spin_lock(&dentry->d_lock);
        if (!list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&sbi->lookup_lock);
+                spin_unlock(&autofs4_lock);
                return -ENOTEMPTY;
        }
-        autofs4_add_expiring(dentry);
+        __autofs4_add_expiring(dentry);
-        spin_lock(&dentry->d_lock);
+        spin_unlock(&sbi->lookup_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        if (atomic_dec_and_test(&ino->count)) {
                p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -829,9 +852,9 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_add(dentry, inode);
        if (dir == dir->i_sb->s_root->d_inode)
-                dentry->d_op = &autofs4_root_dentry_operations;
+                d_set_d_op(dentry, &autofs4_root_dentry_operations);
        else
-                dentry->d_op = &autofs4_dentry_operations;
+                d_set_d_op(dentry, &autofs4_dentry_operations);
        dentry->d_fsdata = ino;
        ino->dentry = dget(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f..c5f8459c905 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 {
        struct dentry *root = sbi->sb->s_root;
        struct dentry *tmp;
-        char *buf = *name;
+        char *buf;
        char *p;
-        int len = 0;
+        int len;
+        unsigned seq;
-        spin_lock(&dcache_lock);
+rename_retry:
+        buf = *name;
+        len = 0;
+        seq = read_seqbegin(&rename_lock);
+        rcu_read_lock();
+        spin_lock(&autofs4_lock);
        for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
                len += tmp->d_name.len + 1;
        if (!len || --len > NAME_MAX) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&autofs4_lock);
+                rcu_read_unlock();
+                if (read_seqretry(&rename_lock, seq))
+                        goto rename_retry;
                return 0;
        }
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
                p -= tmp->d_name.len;
                strncpy(p, tmp->d_name.name, tmp->d_name.len);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
        return len;
 }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aadde..9ad2369d9e3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
        return -EIO;
 }
-static int bad_inode_permission(struct inode *inode, int mask)
+static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU) /* RCU-flagged check: answer -ECHILD instead of the usual -EIO */
+                return -ECHILD; /* NOTE(review): presumably makes the caller retry without IPERM_FLAG_RCU -- confirm */
        return -EIO;
 }
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d0..27223878ba9 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 }
 static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
+fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) /* passes every field of *n through fsrun_to_cpu()/fs64_to_cpu(); result is returned by value */
 {
        befs_data_stream data;
        int i;
        for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
-                data.direct[i] = fsrun_to_cpu(sb, n.direct[i]);
+                data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); /* n is read through a const pointer; the on-disk struct is no longer copied into the call */
-        data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range);
+        data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
-        data.indirect = fsrun_to_cpu(sb, n.indirect);
+        data.indirect = fsrun_to_cpu(sb, n->indirect);
-        data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range);
+        data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
-        data.double_indirect = fsrun_to_cpu(sb, n.double_indirect);
+        data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
        data.max_double_indirect_range = fs64_to_cpu(sb,
-                                                     n.
+                                                     n->
                                                     max_double_indirect_range);
-        data.size = fs64_to_cpu(sb, n.size);
+        data.size = fs64_to_cpu(sb, n->size);
        return data;
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index aa4e7c7ae3c..b1d0c794747 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
        return &bi->vfs_inode;
 }
-static void
+static void befs_i_callback(struct rcu_head *head) /* RCU callback that performs the actual free of a befs inode */
-befs_destroy_inode(struct inode *inode)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu); /* recover the inode that embeds this rcu_head */
+        INIT_LIST_HEAD(&inode->i_dentry); /* reset the i_dentry list head before the object returns to the slab cache */
        kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
+static void befs_destroy_inode(struct inode *inode) /* no longer frees directly; the free is queued through RCU */
+{
+        call_rcu(&inode->i_rcu, befs_i_callback); /* befs_i_callback() runs later and releases the befs_inode_info */
+}
 static void init_once(void *foo)
 {
        struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
                int num_blks;
                befs_ino->i_data.ds =
-                    fsds_to_cpu(sb, raw_inode->data.datastream);
+                    fsds_to_cpu(sb, &raw_inode->data.datastream);
                num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
                inode->i_blocks =
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 76db6d7d49b..a8e37f81d09 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
        return &bi->vfs_inode;
 }
-static void bfs_destroy_inode(struct inode *inode)
+static void bfs_i_callback(struct rcu_head *head) /* RCU callback: returns the bfs inode to bfs_inode_cachep */
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu); /* head is the i_rcu member of the inode being freed */
+        INIT_LIST_HEAD(&inode->i_dentry); /* leave an empty i_dentry list head in the object handed back to the cache */
        kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
+static void bfs_destroy_inode(struct inode *inode) /* defers the free to bfs_i_callback() instead of freeing inline */
+{
+        call_rcu(&inode->i_rcu, bfs_i_callback); /* queue the callback on the inode's own rcu_head */
+}
 static void init_once(void *foo)
 {
        struct bfs_inode_info *bi = foo;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c..d5b640ba6cb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 static struct linux_binfmt elf_format = {
-                .module         = THIS_MODULE,
+        .module         = THIS_MODULE,
-                .load_binary    = load_elf_binary,
+        .load_binary    = load_elf_binary,
-                .load_shlib     = load_elf_library,
+        .load_shlib     = load_elf_library,
-                .core_dump      = elf_core_dump,
+        .core_dump      = elf_core_dump,
-                .min_coredump   = ELF_EXEC_PAGESIZE,
+        .min_coredump   = ELF_EXEC_PAGESIZE,
-                .hasvdso        = 1
 };
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        return 0;
 }
-#ifndef elf_map
 static unsigned long elf_map(struct file *filep, unsigned long addr,
                struct elf_phdr *eppnt, int prot, int type,
                unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        return(map_addr);
 }
-#endif /* !elf_map */
 static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
 {
        int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                goto out;
        retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
-                             (char *)elf_phdata,size);
+                             (char *)elf_phdata, size);
        error = -EIO;
        if (retval != size) {
                if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                goto out;
        if (!elf_check_arch(&loc->elf_ex))
                goto out;
-        if (!bprm->file->f_op||!bprm->file->f_op->mmap)
+        if (!bprm->file->f_op || !bprm->file->f_op->mmap)
                goto out;
        /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        /* There was a PT_LOAD segment with p_memsz > p_filesz
                           before this one. Map anonymous pages, if needed,
                           and clear the area.  */
-                        retval = set_brk (elf_bss + load_bias,
+                        retval = set_brk(elf_bss + load_bias,
-                                          elf_brk + load_bias);
+                                         elf_brk + load_bias);
                        if (retval) {
                                send_sig(SIGKILL, current, 0);
                                goto out_free_dentry;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4230252fd68..771f2352701 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -409,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void bdev_destroy_inode(struct inode *inode)
+static void bdev_i_callback(struct rcu_head *head) /* RCU callback: frees the bdev_inode that contains the inode */
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu); /* map the rcu_head back to its enclosing inode */
        struct bdev_inode *bdi = BDEV_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry); /* reset the i_dentry list head before the kmem_cache_free() below */
        kmem_cache_free(bdev_cachep, bdi);
 }
+static void bdev_destroy_inode(struct inode *inode) /* schedules bdev_i_callback() rather than freeing immediately */
+{
+        call_rcu(&inode->i_rcu, bdev_i_callback); /* the actual free happens in the callback */
+}
 static void init_once(void *foo)
 {
        struct bdev_inode *ei = (struct bdev_inode *) foo;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b..6ae2c8cac9d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -185,18 +185,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
        return ret;
 }
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) /* returns -EAGAIN when no ACL verdict is produced, see below */
 {
-        struct posix_acl *acl;
        int error = -EAGAIN;
-        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (flags & IPERM_FLAG_RCU) { /* RCU-flagged call: btrfs_get_acl() is not invoked on this path */
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        error = -ECHILD; /* ACL not cached as negative -> -ECHILD; otherwise error stays -EAGAIN */
-        if (IS_ERR(acl))
+        } else { /* non-RCU call: fetch the access ACL and evaluate it */
-                return PTR_ERR(acl);
+                struct posix_acl *acl;
-        if (acl) {
+                acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-                error = posix_acl_permission(inode, acl, mask);
+                if (IS_ERR(acl))
-                posix_acl_release(acl);
+                        return PTR_ERR(acl); /* lookup failure is propagated as-is */
+                if (acl) {
+                        error = posix_acl_permission(inode, acl, mask);
+                        posix_acl_release(acl); /* release the acl returned by btrfs_get_acl() */
+                }
        }
        return error;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index af52f6d7a4d..a142d204b52 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2544,7 +2544,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 #else
 #define btrfs_check_acl NULL
 #endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 659f532d26a..0ccf9a8afcd 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -110,7 +110,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
        dentry = d_obtain_alias(inode);
        if (!IS_ERR(dentry))
-                dentry->d_op = &btrfs_dentry_operations;
+                d_set_d_op(dentry, &btrfs_dentry_operations);
        return dentry;
 fail:
        srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -225,7 +225,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
        key.offset = 0;
        dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
        if (!IS_ERR(dentry))
-                dentry->d_op = &btrfs_dentry_operations;
+                d_set_d_op(dentry, &btrfs_dentry_operations);
        return dentry;
 fail:
        btrfs_free_path(path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 72f31ecb5c9..a0ff46a4789 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4084,7 +4084,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        int index;
        int ret;
-        dentry->d_op = &btrfs_dentry_operations;
+        d_set_d_op(dentry, &btrfs_dentry_operations);
        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
@@ -4127,7 +4127,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        return inode;
 }
-static int btrfs_dentry_delete(struct dentry *dentry)
+static int btrfs_dentry_delete(const struct dentry *dentry)
 {
        struct btrfs_root *root;
@@ -6495,6 +6495,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        return inode;
 }
+static void btrfs_i_callback(struct rcu_head *head) /* RCU callback: gives the btrfs inode back to btrfs_inode_cachep */
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu); /* head is embedded in the inode as i_rcu */
+        INIT_LIST_HEAD(&inode->i_dentry); /* reset the i_dentry list head before the object is freed to the cache */
+        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); /* same free call the non-RCU code issued directly */
+}
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
@@ -6564,7 +6571,7 @@ void btrfs_destroy_inode(struct inode *inode)
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+        call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 int btrfs_drop_inode(struct inode *inode)
@@ -7204,11 +7211,11 @@ static int btrfs_set_page_dirty(struct page *page)
        return __set_page_dirty_nobuffers(page);
 }
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) /* flags is not inspected here, only passed on */
 {
        if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
                return -EACCES;
-        return generic_permission(inode, mask, btrfs_check_acl);
+        return generic_permission(inode, mask, flags, btrfs_check_acl); /* forward flags unchanged, with btrfs_check_acl as the ACL hook */
 }
 static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/buffer.c b/fs/buffer.c
index 5930e382959..2219a76e2ca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
 static void bh_lru_install(struct buffer_head *bh)
 {
        struct buffer_head *evictee = NULL;
-        struct bh_lru *lru;
        check_irqs_on();
        bh_lru_lock();
-        lru = &__get_cpu_var(bh_lrus);
+        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
-        if (lru->bhs[0] != bh) {
                struct buffer_head *bhs[BH_LRU_SIZE];
                int in;
                int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
                get_bh(bh);
                bhs[out++] = bh;
                for (in = 0; in < BH_LRU_SIZE; in++) {
-                        struct buffer_head *bh2 = lru->bhs[in];
+                        struct buffer_head *bh2 =
+                                __this_cpu_read(bh_lrus.bhs[in]);
                        if (bh2 == bh) {
                                __brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
                }
                while (out < BH_LRU_SIZE)
                        bhs[out++] = NULL;
-                memcpy(lru->bhs, bhs, sizeof(bhs));
+                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
        }
        bh_lru_unlock();
@@ -1313,23 +1312,22 @@ static struct buffer_head *
 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 {
        struct buffer_head *ret = NULL;
-        struct bh_lru *lru;
        unsigned int i;
        check_irqs_on();
        bh_lru_lock();
-        lru = &__get_cpu_var(bh_lrus);
        for (i = 0; i < BH_LRU_SIZE; i++) {
-                struct buffer_head *bh = lru->bhs[i];
+                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
                if (bh && bh->b_bdev == bdev &&
                                bh->b_blocknr == block && bh->b_size == size) {
                        if (i) {
                                while (i) {
-                                        lru->bhs[i] = lru->bhs[i - 1];
+                                        __this_cpu_write(bh_lrus.bhs[i],
+                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
                                        i--;
                                }
-                                lru->bhs[0] = bh;
+                                __this_cpu_write(bh_lrus.bhs[0], bh);
                        }
                        get_bh(bh);
                        ret = bh;
@@ -3203,22 +3201,23 @@ static void recalc_bh_state(void)
        int i;
        int tot = 0;
-        if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
+        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
-        __get_cpu_var(bh_accounting).ratelimit = 0;
+        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
 }
-        
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
        if (ret) {
                INIT_LIST_HEAD(&ret->b_assoc_buffers);
-                get_cpu_var(bh_accounting).nr++;
+                preempt_disable(); /* keep the increment and recalc_bh_state() on the same CPU's bh_accounting */
+                __this_cpu_inc(bh_accounting.nr); /* one more buffer head accounted to this CPU */
                recalc_bh_state();
-                put_cpu_var(bh_accounting);
+                preempt_enable(); /* pairs with the preempt_disable() above */
        }
        return ret;
 }
@@ -3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
 {
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
-        get_cpu_var(bh_accounting).nr--;
+        preempt_disable(); /* keep the decrement and recalc_bh_state() on the same CPU's bh_accounting */
+        __this_cpu_dec(bh_accounting.nr); /* one fewer buffer head accounted to this CPU */
        recalc_bh_state();
-        put_cpu_var(bh_accounting);
+        preempt_enable(); /* pairs with the preempt_disable() above */
 }
 EXPORT_SYMBOL(free_buffer_head);
@@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
                brelse(b->bhs[i]);
                b->bhs[i] = NULL;
        }
-        get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
+        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;
-        put_cpu_var(bh_accounting);
 }
 static int buffer_cpu_notify(struct notifier_block *self,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d902948a90d..fa7ca04ee81 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -42,11 +42,11 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
            ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-                dentry->d_op = &ceph_dentry_ops;
+                d_set_d_op(dentry, &ceph_dentry_ops);
        else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-                dentry->d_op = &ceph_snapdir_dentry_ops;
+                d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
        else
-                dentry->d_op = &ceph_snap_dentry_ops;
+                d_set_d_op(dentry, &ceph_snap_dentry_ops);
        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
        if (!di)
@@ -112,7 +112,7 @@ static int __dcache_readdir(struct file *filp,
        dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
             last);
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        /* start at beginning? */
        if (filp->f_pos == 2 || last == NULL ||
@@ -136,6 +136,7 @@ more:
                        fi->at_end = 1;
                        goto out_unlock;
                }
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                if (!d_unhashed(dentry) && dentry->d_inode &&
                    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
                    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -145,13 +146,15 @@ more:
                     dentry->d_name.len, dentry->d_name.name, di->offset,
                     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
                     !dentry->d_inode ? " null" : "");
+                spin_unlock(&dentry->d_lock);
                p = p->prev;
                dentry = list_entry(p, struct dentry, d_u.d_child);
                di = ceph_dentry(dentry);
        }
-        atomic_inc(&dentry->d_count);
+        dget_dlock(dentry);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
        dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
             dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -177,19 +180,19 @@ more:
        filp->f_pos++;
-        /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+        /* make sure a dentry wasn't dropped while we didn't have parent lock */
        if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
                dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
                err = -EAGAIN;
                goto out;
        }
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        p = p->prev;    /* advance to next dentry */
        goto more;
 out_unlock:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 out:
        if (last)
                dput(last);
@@ -987,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct inode *dir;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        dir = dentry->d_parent->d_inode;
        dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
             dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bf1286588f2..e61de4f7b99 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -368,6 +368,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        return &ci->vfs_inode;
 }
+static void ceph_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ceph_inode_cachep, ci);
+}
 void ceph_destroy_inode(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -407,7 +416,7 @@ void ceph_destroy_inode(struct inode *inode)
        if (ci->i_xattrs.prealloc_blob)
                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
-        kmem_cache_free(ceph_inode_cachep, ci);
+        call_rcu(&inode->i_rcu, ceph_i_callback);
 }
@@ -841,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
        di->offset = ceph_inode(inode)->i_max_offset++;
        spin_unlock(&inode->i_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&dir->d_lock);
-        spin_lock(&dn->d_lock);
+        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
        list_move(&dn->d_u.d_child, &dir->d_subdirs);
        dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
             dn->d_u.d_child.prev, dn->d_u.d_child.next);
        spin_unlock(&dn->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dir->d_lock);
 }
 /*
@@ -879,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        } else if (realdn) {
                dout("dn %p (%d) spliced with %p (%d) "
                     "inode %p ino %llx.%llx\n",
-                     dn, atomic_read(&dn->d_count),
+                     dn, dn->d_count,
-                     realdn, atomic_read(&realdn->d_count),
+                     realdn, realdn->d_count,
                     realdn->d_inode, ceph_vinop(realdn->d_inode));
                dput(dn);
                dn = realdn;
@@ -1231,11 +1240,11 @@ retry_lookup:
                        goto retry_lookup;
                } else {
                        /* reorder parent's d_subdirs */
-                        spin_lock(&dcache_lock);
+                        spin_lock(&parent->d_lock);
-                        spin_lock(&dn->d_lock);
+                        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
                        list_move(&dn->d_u.d_child, &parent->d_subdirs);
                        spin_unlock(&dn->d_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&parent->d_lock);
                }
                di = dn->d_fsdata;
@@ -1772,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
 * Check inode permissions.  We verify we have a valid value for
 * the AUTH cap, then call the generic handler.
 */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+        int err;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
        if (!err)
-                err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, flags, NULL);
        return err;
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 38800eaa81d..a50fca1e03b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1486,7 +1486,7 @@ retry:
        *base = ceph_ino(temp->d_inode);
        *plen = len;
        dout("build_path on %p %d built %llx '%.*s'\n",
-             dentry, atomic_read(&dentry->d_count), *base, len, path);
+             dentry, dentry->d_count, *base, len, path);
        return path;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7f01728a465..4553d8829ed 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -665,7 +665,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
 extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
                        struct kstat *stat);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b9..6e99b9ddd4e 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
        return ret;
 }
-int cdev_index(struct inode *inode)
-{
-        int idx;
-        struct kobject *kobj;
-        kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
-        if (!kobj)
-                return -1;
-        kobject_put(kobj);
-        return idx;
-}
 void cd_forget(struct inode *inode)
 {
        spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
-EXPORT_SYMBOL(cdev_index);
 EXPORT_SYMBOL(__register_chrdev);
 EXPORT_SYMBOL(__unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fc..e654dfd092c 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
                                   void *buffer, uint16_t maxbuf)
 {
        const struct TCP_Server_Info *server = cookie_netfs_data;
-        const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+        const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+        const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+        const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
        struct cifs_server_key *key = buffer;
        uint16_t key_len = sizeof(struct cifs_server_key);
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
         */
        switch (sa->sa_family) {
        case AF_INET:
-                key->family = server->addr.sockAddr.sin_family;
+                key->family = sa->sa_family;
-                key->port = server->addr.sockAddr.sin_port;
+                key->port = addr->sin_port;
-                key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+                key->addr[0].ipv4_addr = addr->sin_addr;
                key_len += sizeof(key->addr[0].ipv4_addr);
                break;
        case AF_INET6:
-                key->family = server->addr.sockAddr6.sin6_family;
+                key->family = sa->sa_family;
-                key->port = server->addr.sockAddr6.sin6_port;
+                key->port = addr6->sin6_port;
-                key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+                key->addr[0].ipv6_addr = addr6->sin6_addr;
                key_len += sizeof(key->addr[0].ipv6_addr);
                break;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b..ede98300a8c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                    "Display Internal CIFS Data Structures for Debugging\n"
                    "---------------------------------------------------\n");
        seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
-        seq_printf(m, "Features: ");
+        seq_printf(m, "Features:");
 #ifdef CONFIG_CIFS_DFS_UPCALL
-        seq_printf(m, "dfs");
+        seq_printf(m, " dfs");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_FSCACHE
-        seq_printf(m, "fscache");
+        seq_printf(m, " fscache");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-        seq_printf(m, "lanman");
+        seq_printf(m, " lanman");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_POSIX
-        seq_printf(m, "posix");
+        seq_printf(m, " posix");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_UPCALL
-        seq_printf(m, "spnego");
+        seq_printf(m, " spnego");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_XATTR
-        seq_printf(m, "xattr");
+        seq_printf(m, " xattr");
+#endif
+#ifdef CONFIG_CIFS_ACL
+        seq_printf(m, " acl");
 #endif
        seq_putc(m, '\n');
        seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1..4dfba828316 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 {
        struct TCP_Server_Info *server = sesInfo->server;
+        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
        char *description, *dp;
        size_t desc_len;
        struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
        dp = description + strlen(description);
        /* add the server address */
-        if (server->addr.sockAddr.sin_family == AF_INET)
+        if (server->dstaddr.ss_family == AF_INET)
-                sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
+                sprintf(dp, "ip4=%pI4", &sa->sin_addr);
-        else if (server->addr.sockAddr.sin_family == AF_INET6)
+        else if (server->dstaddr.ss_family == AF_INET6)
-                sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
+                sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
        else
                goto out;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161a..66f3d50d067 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
        return 0;
 }
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
                  __u32 *pexpected_response_sequence_number)
 {
@@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
        if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
                return rc;
-        spin_lock(&GlobalMid_Lock);
        cifs_pdu->Signature.Sequence.SequenceNumber =
                        cpu_to_le32(server->sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
        *pexpected_response_sequence_number = server->sequence_number++;
        server->sequence_number++;
-        spin_unlock(&GlobalMid_Lock);
        rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
        if (rc)
@@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
        return rc;
 }
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
                   __u32 *pexpected_response_sequence_number)
 {
@@ -162,14 +162,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
        if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
                return rc;
-        spin_lock(&GlobalMid_Lock);
        cifs_pdu->Signature.Sequence.SequenceNumber =
                                cpu_to_le32(server->sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
        *pexpected_response_sequence_number = server->sequence_number++;
        server->sequence_number++;
-        spin_unlock(&GlobalMid_Lock);
        rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
        if (rc)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3936aa7f2c2..5e7075d5f13 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -283,10 +283,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int cifs_permission(struct inode *inode, int mask)
+static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct cifs_sb_info *cifs_sb;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        cifs_sb = CIFS_SB(inode->i_sb);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -298,7 +301,7 @@ static int cifs_permission(struct inode *inode, int mask)
                on the client (above and beyond ACL on servers) for
                servers which do not support setting and viewing mode bits,
                so allowing client to check permissions is useful */
-                return generic_permission(inode, mask, NULL);
+                return generic_permission(inode, mask, flags, NULL);
 }
 static struct kmem_cache *cifs_inode_cachep;
@@ -326,6 +329,8 @@ cifs_alloc_inode(struct super_block *sb)
        cifs_inode->invalid_mapping = false;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
        cifs_inode->server_eof = 0;
+        cifs_inode->uniqueid = 0;
+        cifs_inode->createtime = 0;
        /* Can not set i_flags here - they get immediately overwritten
           to zero by the VFS */
@@ -334,10 +339,17 @@ cifs_alloc_inode(struct super_block *sb)
        return &cifs_inode->vfs_inode;
 }
+static void cifs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+}
 static void
 cifs_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+        call_rcu(&inode->i_rcu, cifs_i_callback);
 }
 static void
@@ -351,18 +363,19 @@ cifs_evict_inode(struct inode *inode)
 static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
+        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
        seq_printf(s, ",addr=");
-        switch (server->addr.sockAddr.sin_family) {
+        switch (server->dstaddr.ss_family) {
        case AF_INET:
-                seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
+                seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
                break;
        case AF_INET6:
-                seq_printf(s, "%pI6",
+                seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
-                           &server->addr.sockAddr6.sin6_addr.s6_addr);
+                if (sa6->sin6_scope_id)
-                if (server->addr.sockAddr6.sin6_scope_id)
+                        seq_printf(s, "%%%u", sa6->sin6_scope_id);
-                        seq_printf(s, "%%%u",
-                                   server->addr.sockAddr6.sin6_scope_id);
                break;
        default:
                seq_printf(s, "(unknown)");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7136c0c3e2f..606ca8bb710 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -163,10 +163,7 @@ struct TCP_Server_Info {
        char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
        char *hostname; /* hostname portion of UNC string */
        struct socket *ssocket;
-        union {
+        struct sockaddr_storage dstaddr;
-                struct sockaddr_in sockAddr;
-                struct sockaddr_in6 sockAddr6;
-        } addr;
        struct sockaddr_storage srcaddr; /* locally bind to this IP */
        wait_queue_head_t response_q;
        wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
@@ -210,7 +207,7 @@ struct TCP_Server_Info {
        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
-        __u32 sequence_number; /* needed for CIFS PDU signature */
+        __u32 sequence_number; /* for signing, protected by srv_mutex */
        struct session_key session_key;
        unsigned long lstrp; /* when we got last response from this server */
        u16 dialect; /* dialect index that server chose */
@@ -456,6 +453,7 @@ struct cifsInodeInfo {
        bool invalid_mapping:1;         /* pagecache is invalid */
        u64  server_eof;                /* current file size on server */
        u64  uniqueid;                  /* server inode number */
+        u64  createtime;                /* creation time on server */
 #ifdef CONFIG_CIFS_FSCACHE
        struct fscache_cookie *fscache;
 #endif
@@ -576,6 +574,7 @@ struct cifs_fattr {
        u64             cf_uniqueid;
        u64             cf_eof;
        u64             cf_bytes;
+        u64             cf_createtime;
        uid_t           cf_uid;
        gid_t           cf_gid;
        umode_t         cf_mode;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 67acfb3acad..2f6795e524d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -401,15 +401,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
                cFYI(1, "Kerberos only mechanism, enable extended security");
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-        }
+        } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-        else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
        else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
                cFYI(1, "NTLMSSP only mechanism, enable extended security");
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
        }
-#endif
        count = 0;
        for (i = 0; i < CIFS_NUM_PROT; i++) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index cc1a8604a79..a65d311d163 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -64,8 +64,8 @@ struct smb_vol {
        char *UNC;
        char *UNCip;
        char *iocharset;  /* local code page for mapping to and from Unicode */
-        char source_rfc1001_name[16]; /* netbios name of client */
+        char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
-        char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+        char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
        uid_t cred_uid;
        uid_t linux_uid;
        gid_t linux_gid;
@@ -115,8 +115,8 @@ struct smb_vol {
 #define TLINK_ERROR_EXPIRE      (1 * HZ)
 #define TLINK_IDLE_EXPIRE       (600 * HZ)
-static int ipv4_connect(struct TCP_Server_Info *server);
+static int ip_connect(struct TCP_Server_Info *server);
-static int ipv6_connect(struct TCP_Server_Info *server);
+static int generic_ip_connect(struct TCP_Server_Info *server);
 static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
 static void cifs_prune_tlinks(struct work_struct *work);
@@ -200,10 +200,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
        while ((server->tcpStatus != CifsExiting) &&
               (server->tcpStatus != CifsGood)) {
                try_to_freeze();
-                if (server->addr.sockAddr6.sin6_family == AF_INET6)
-                        rc = ipv6_connect(server);
+                /* we should try only the port we connected to before */
-                else
+                rc = generic_ip_connect(server);
-                        rc = ipv4_connect(server);
                if (rc) {
                        cFYI(1, "reconnect error %d", rc);
                        msleep(3000);
@@ -477,7 +476,7 @@ incomplete_rcv:
                         * initialize frame)
                         */
                        cifs_set_port((struct sockaddr *)
-                                        &server->addr.sockAddr, CIFS_PORT);
+                                        &server->dstaddr, CIFS_PORT);
                        cifs_reconnect(server);
                        csocket = server->ssocket;
                        wake_up(&server->response_q);
@@ -817,11 +816,11 @@ cifs_parse_mount_options(char *options, const char *devname,
         * informational, only used for servers that do not support
         * port 445 and it can be overridden at mount time
         */
-        memset(vol->source_rfc1001_name, 0x20, 15);
+        memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
-        for (i = 0; i < strnlen(nodename, 15); i++)
+        for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
                vol->source_rfc1001_name[i] = toupper(nodename[i]);
-        vol->source_rfc1001_name[15] = 0;
+        vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
        /* null target name indicates to use *SMBSERVR default called name
           if we end up sending RFC1001 session initialize */
        vol->target_rfc1001_name[0] = 0;
@@ -985,13 +984,11 @@ cifs_parse_mount_options(char *options, const char *devname,
                                return 1;
                        } else if (strnicmp(value, "krb5", 4) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_KRB5;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
                        } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
                                        CIFSSEC_MUST_SIGN;
                        } else if (strnicmp(value, "ntlmssp", 7) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
-#endif
                        } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
                                        CIFSSEC_MUST_SIGN;
@@ -1168,22 +1165,22 @@ cifs_parse_mount_options(char *options, const char *devname,
                        if (!value || !*value || (*value == ' ')) {
                                cFYI(1, "invalid (empty) netbiosname");
                        } else {
-                                memset(vol->source_rfc1001_name, 0x20, 15);
+                                memset(vol->source_rfc1001_name, 0x20,
-                                for (i = 0; i < 15; i++) {
+                                        RFC1001_NAME_LEN);
-                                /* BB are there cases in which a comma can be
+                                /*
-                                valid in this workstation netbios name (and need
+                                 * FIXME: are there cases in which a comma can
-                                special handling)? */
+                                 * be valid in workstation netbios name (and
+                                 * need special handling)?
-                                /* We do not uppercase netbiosname for user */
+                                 */
+                                for (i = 0; i < RFC1001_NAME_LEN; i++) {
+                                        /* don't ucase netbiosname for user */
                                        if (value[i] == 0)
                                                break;
-                                        else
+                                        vol->source_rfc1001_name[i] = value[i];
-                                                vol->source_rfc1001_name[i] =
-                                                                value[i];
                                }
                                /* The string has 16th byte zero still from
                                set at top of the function  */
-                                if ((i == 15) && (value[i] != 0))
+                                if (i == RFC1001_NAME_LEN && value[i] != 0)
                                        printk(KERN_WARNING "CIFS: netbiosname"
                                                " longer than 15 truncated.\n");
                        }
@@ -1193,7 +1190,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                                cFYI(1, "empty server netbiosname specified");
                        } else {
                                /* last byte, type, is 0x20 for servr type */
-                                memset(vol->target_rfc1001_name, 0x20, 16);
+                                memset(vol->target_rfc1001_name, 0x20,
+                                        RFC1001_NAME_LEN_WITH_NULL);
                                for (i = 0; i < 15; i++) {
                                /* BB are there cases in which a comma can be
@@ -1210,7 +1208,7 @@ cifs_parse_mount_options(char *options, const char *devname,
                                }
                                /* The string has 16th byte zero still from
                                   set at top of the function  */
-                                if ((i == 15) && (value[i] != 0))
+                                if (i == RFC1001_NAME_LEN && value[i] != 0)
                                        printk(KERN_WARNING "CIFS: server net"
                                        "biosname longer than 15 truncated.\n");
                        }
@@ -1341,10 +1339,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                        vol->no_psx_acl = 0;
                } else if (strnicmp(data, "noacl", 5) == 0) {
                        vol->no_psx_acl = 1;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
                } else if (strnicmp(data, "locallease", 6) == 0) {
                        vol->local_lease = 1;
-#endif
                } else if (strnicmp(data, "sign", 4) == 0) {
                        vol->secFlg |= CIFSSEC_MUST_SIGN;
                } else if (strnicmp(data, "seal", 4) == 0) {
@@ -1454,35 +1450,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
        }
 }
+/*
+ * If no port is specified in the addr structure, we try to match against
+ * port 445 first and, if that fails, against port 139. This should be called
+ * only if the address families of server and addr are equal.
+ */
+static bool
+match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+        unsigned short int port, *sport;
+        switch (addr->sa_family) {
+        case AF_INET:
+                sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
+                port = ((struct sockaddr_in *) addr)->sin_port;
+                break;
+        case AF_INET6:
+                sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
+                port = ((struct sockaddr_in6 *) addr)->sin6_port;
+                break;
+        default:
+                WARN_ON(1);
+                return false;
+        }
+        if (!port) {
+                port = htons(CIFS_PORT);
+                if (port == *sport)
+                        return true;
+                port = htons(RFC1001_PORT);
+        }
+        return port == *sport;
+}
 static bool
 match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
              struct sockaddr *srcaddr)
 {
-        struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
-        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
        switch (addr->sa_family) {
-        case AF_INET:
+        case AF_INET: {
-                if (addr4->sin_addr.s_addr !=
+                struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
-                    server->addr.sockAddr.sin_addr.s_addr)
+                struct sockaddr_in *srv_addr4 =
-                        return false;
+                                        (struct sockaddr_in *)&server->dstaddr;
-                if (addr4->sin_port &&
-                    addr4->sin_port != server->addr.sockAddr.sin_port)
+                if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
                        return false;
                break;
-        case AF_INET6:
+        }
+        case AF_INET6: {
+                struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+                struct sockaddr_in6 *srv_addr6 =
+                                        (struct sockaddr_in6 *)&server->dstaddr;
                if (!ipv6_addr_equal(&addr6->sin6_addr,
-                                     &server->addr.sockAddr6.sin6_addr))
+                                     &srv_addr6->sin6_addr))
                        return false;
-                if (addr6->sin6_scope_id !=
+                if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
-                    server->addr.sockAddr6.sin6_scope_id)
-                        return false;
-                if (addr6->sin6_port &&
-                    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
                        return false;
                break;
        }
+        default:
+                WARN_ON(1);
+                return false; /* don't expect to be here */
+        }
        if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
                return false;
@@ -1549,6 +1581,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
                                   (struct sockaddr *)&vol->srcaddr))
                        continue;
+                if (!match_port(server, addr))
+                        continue;
                if (!match_security(server, vol))
                        continue;
@@ -1681,14 +1716,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                cFYI(1, "attempting ipv6 connect");
                /* BB should we allow ipv6 on port 139? */
                /* other OS never observed in Wild doing 139 with v6 */
-                memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
+                memcpy(&tcp_ses->dstaddr, sin_server6,
-                        sizeof(struct sockaddr_in6));
+                       sizeof(struct sockaddr_in6));
-                rc = ipv6_connect(tcp_ses);
+        } else
-        } else {
+                memcpy(&tcp_ses->dstaddr, sin_server,
-                memcpy(&tcp_ses->addr.sockAddr, sin_server,
+                       sizeof(struct sockaddr_in));
-                        sizeof(struct sockaddr_in));
-                rc = ipv4_connect(tcp_ses);
+        rc = ip_connect(tcp_ses);
-        }
        if (rc < 0) {
                cERROR(1, "Error connecting to socket. Aborting operation");
                goto out_err_crypto_release;
@@ -1793,6 +1827,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
        int rc = -ENOMEM, xid;
        struct cifsSesInfo *ses;
+        struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
        xid = GetXid();
@@ -1836,12 +1872,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        /* new SMB session uses our server ref */
        ses->server = server;
-        if (server->addr.sockAddr6.sin6_family == AF_INET6)
+        if (server->dstaddr.ss_family == AF_INET6)
-                sprintf(ses->serverName, "%pI6",
+                sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
-                        &server->addr.sockAddr6.sin6_addr);
        else
-                sprintf(ses->serverName, "%pI4",
+                sprintf(ses->serverName, "%pI4", &addr->sin_addr);
-                        &server->addr.sockAddr.sin_addr.s_addr);
        if (volume_info->username)
                strncpy(ses->userName, volume_info->username,
@@ -2136,19 +2170,106 @@ bind_socket(struct TCP_Server_Info *server)
 }
 static int
-ipv4_connect(struct TCP_Server_Info *server)
+ip_rfc1001_connect(struct TCP_Server_Info *server)
+{
+        int rc = 0;
+        /*
+         * Build an RFC1001 session request from the server's called and
+         * calling names (falling back to DEFAULT_CIFS_CALLED_NAME and
+         * "LINUX_CIFS_CLNT"), send it with smb_send() and then pause
+         * briefly.  Returns the smb_send() result, or 0 if the request
+         * buffer could not be allocated.
+         *
+         * some servers require RFC1001 sessinit before sending
+         * negprot - BB check reconnection in case where second
+         * sessinit is sent but no second negprot
+         */
+        struct rfc1002_session_packet *ses_init_buf;
+        struct smb_hdr *smb_buf;
+        ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
+                               GFP_KERNEL);
+        if (ses_init_buf) {
+                ses_init_buf->trailer.session_req.called_len = 32;
+                if (server->server_RFC1001_name &&
+                    server->server_RFC1001_name[0] != 0)
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.called_name,
+                                      server->server_RFC1001_name,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                else
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.called_name,
+                                      DEFAULT_CIFS_CALLED_NAME,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                ses_init_buf->trailer.session_req.calling_len = 32;
+                /*
+                 * calling name ends in null (byte 16) from old smb
+                 * convention.
+                 */
+                if (server->workstation_RFC1001_name &&
+                    server->workstation_RFC1001_name[0] != 0)
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.calling_name,
+                                      server->workstation_RFC1001_name,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                else
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.calling_name,
+                                      "LINUX_CIFS_CLNT",
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                ses_init_buf->trailer.session_req.scope1 = 0;
+                ses_init_buf->trailer.session_req.scope2 = 0;
+                smb_buf = (struct smb_hdr *)ses_init_buf;
+                /* sizeof RFC1002_SESSION_REQUEST with no scope */
+                smb_buf->smb_buf_length = 0x81000044;
+                rc = smb_send(server, smb_buf, 0x44);
+                kfree(ses_init_buf);
+                /*
+                 * RFC1001 layer in at least one server
+                 * requires very short break before negprot
+                 * presumably because not expecting negprot
+                 * to follow so fast.  This is a simple
+                 * solution that works without
+                 * complicating the code and causes no
+                 * significant slowing down on mount
+                 * for everyone else
+                 */
+                usleep_range(1000, 2000);
+        }
+        /*
+         * else (allocation failed, rc is still 0) the negprot may
+         * still work without this even though malloc failed
+         */
+        return rc;
+}
+static int
+generic_ip_connect(struct TCP_Server_Info *server)
 {
        int rc = 0;
-        int val;
+        unsigned short int sport;
-        bool connected = false;
+        int slen, sfamily;
-        __be16 orig_port = 0;
        struct socket *socket = server->ssocket;
+        struct sockaddr *saddr;
+        saddr = (struct sockaddr *) &server->dstaddr;
+        if (server->dstaddr.ss_family == AF_INET6) {
+                sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
+                slen = sizeof(struct sockaddr_in6);
+                sfamily = AF_INET6;
+        } else {
+                sport = ((struct sockaddr_in *) saddr)->sin_port;
+                slen = sizeof(struct sockaddr_in);
+                sfamily = AF_INET;
+        }
        if (socket == NULL) {
-                rc = sock_create_kern(PF_INET, SOCK_STREAM,
+                rc = sock_create_kern(sfamily, SOCK_STREAM,
                                      IPPROTO_TCP, &socket);
                if (rc < 0) {
                        cERROR(1, "Error %d creating socket", rc);
+                        server->ssocket = NULL;
                        return rc;
                }
@@ -2156,63 +2277,28 @@ ipv4_connect(struct TCP_Server_Info *server)
                cFYI(1, "Socket created");
                server->ssocket = socket;
                socket->sk->sk_allocation = GFP_NOFS;
-                cifs_reclassify_socket4(socket);
+                if (sfamily == AF_INET6)
+                        cifs_reclassify_socket6(socket);
+                else
+                        cifs_reclassify_socket4(socket);
        }
        rc = bind_socket(server);
        if (rc < 0)
                return rc;
-        /* user overrode default port */
+        rc = socket->ops->connect(socket, saddr, slen, 0);
-        if (server->addr.sockAddr.sin_port) {
+        if (rc < 0) {
-                rc = socket->ops->connect(socket, (struct sockaddr *)
+                cFYI(1, "Error %d connecting to server", rc);
-                                          &server->addr.sockAddr,
-                                          sizeof(struct sockaddr_in), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        if (!connected) {
-                /* save original port so we can retry user specified port
-                        later if fall back ports fail this time  */
-                orig_port = server->addr.sockAddr.sin_port;
-                /* do not retry on the same port we just failed on */
-                if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
-                        server->addr.sockAddr.sin_port = htons(CIFS_PORT);
-                        rc = socket->ops->connect(socket,
-                                                (struct sockaddr *)
-                                                &server->addr.sockAddr,
-                                                sizeof(struct sockaddr_in), 0);
-                        if (rc >= 0)
-                                connected = true;
-                }
-        }
-        if (!connected) {
-                server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
-                rc = socket->ops->connect(socket, (struct sockaddr *)
-                                              &server->addr.sockAddr,
-                                              sizeof(struct sockaddr_in), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        /* give up here - unless we want to retry on different
-                protocol families some day */
-        if (!connected) {
-                if (orig_port)
-                        server->addr.sockAddr.sin_port = orig_port;
-                cFYI(1, "Error %d connecting to server via ipv4", rc);
                sock_release(socket);
                server->ssocket = NULL;
                return rc;
        }
        /*
         * Eventually check for other socket options to change from
-         *  the default. sock_setsockopt not used because it expects
+         * the default. sock_setsockopt not used because it expects
-         *  user space buffer
+         * user space buffer
         */
        socket->sk->sk_rcvtimeo = 7 * HZ;
        socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2226,7 +2312,7 @@ ipv4_connect(struct TCP_Server_Info *server)
        }
        if (server->tcp_nodelay) {
-                val = 1;
+                int val = 1;
                rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
                                (char *)&val, sizeof(val));
                if (rc)
@@ -2237,161 +2323,39 @@ ipv4_connect(struct TCP_Server_Info *server)
                 socket->sk->sk_sndbuf,
                 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
-        /* send RFC1001 sessinit */
+        if (sport == htons(RFC1001_PORT))
-        if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
+                rc = ip_rfc1001_connect(server);
-                /* some servers require RFC1001 sessinit before sending
-                negprot - BB check reconnection in case where second
-                sessinit is sent but no second negprot */
-                struct rfc1002_session_packet *ses_init_buf;
-                struct smb_hdr *smb_buf;
-                ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
-                                       GFP_KERNEL);
-                if (ses_init_buf) {
-                        ses_init_buf->trailer.session_req.called_len = 32;
-                        if (server->server_RFC1001_name &&
-                            server->server_RFC1001_name[0] != 0)
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.called_name,
-                                              server->server_RFC1001_name,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        else
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.called_name,
-                                              DEFAULT_CIFS_CALLED_NAME,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        ses_init_buf->trailer.session_req.calling_len = 32;
-                        /* calling name ends in null (byte 16) from old smb
-                        convention. */
-                        if (server->workstation_RFC1001_name &&
-                            server->workstation_RFC1001_name[0] != 0)
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.calling_name,
-                                              server->workstation_RFC1001_name,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        else
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.calling_name,
-                                              "LINUX_CIFS_CLNT",
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        ses_init_buf->trailer.session_req.scope1 = 0;
-                        ses_init_buf->trailer.session_req.scope2 = 0;
-                        smb_buf = (struct smb_hdr *)ses_init_buf;
-                        /* sizeof RFC1002_SESSION_REQUEST with no scope */
-                        smb_buf->smb_buf_length = 0x81000044;
-                        rc = smb_send(server, smb_buf, 0x44);
-                        kfree(ses_init_buf);
-                        msleep(1); /* RFC1001 layer in at least one server
-                                      requires very short break before negprot
-                                      presumably because not expecting negprot
-                                      to follow so fast.  This is a simple
-                                      solution that works without
-                                      complicating the code and causes no
-                                      significant slowing down on mount
-                                      for everyone else */
-                }
-                /* else the negprot may still work without this
-                even though malloc failed */
-        }
        return rc;
 }
 static int
-ipv6_connect(struct TCP_Server_Info *server)
+ip_connect(struct TCP_Server_Info *server)
 {
-        int rc = 0;
+        unsigned short int *sport;
-        int val;
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
-        bool connected = false;
+        struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
-        __be16 orig_port = 0;
-        struct socket *socket = server->ssocket;
-        if (socket == NULL) {
+        if (server->dstaddr.ss_family == AF_INET6)
-                rc = sock_create_kern(PF_INET6, SOCK_STREAM,
+                sport = &addr6->sin6_port;
-                                      IPPROTO_TCP, &socket);
+        else
-                if (rc < 0) {
+                sport = &addr->sin_port;
-                        cERROR(1, "Error %d creating ipv6 socket", rc);
-                        socket = NULL;
-                        return rc;
-                }
-                /* BB other socket options to set KEEPALIVE, NODELAY? */
+        if (*sport == 0) {
-                cFYI(1, "ipv6 Socket created");
+                int rc;
-                server->ssocket = socket;
-                socket->sk->sk_allocation = GFP_NOFS;
-                cifs_reclassify_socket6(socket);
-        }
-        rc = bind_socket(server);
+                /*
+                 * No destination port is set in dstaddr: try port 445
+                 * (CIFS_PORT) first and return at once if that works.
+                 */
-        if (rc < 0)
+                *sport = htons(CIFS_PORT);
-                return rc;
-        /* user overrode default port */
+                rc = generic_ip_connect(server);
-        if (server->addr.sockAddr6.sin6_port) {
-                rc = socket->ops->connect(socket,
-                                (struct sockaddr *) &server->addr.sockAddr6,
-                                sizeof(struct sockaddr_in6), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        if (!connected) {
-                /* save original port so we can retry user specified port
-                        later if fall back ports fail this time  */
-                orig_port = server->addr.sockAddr6.sin6_port;
-                /* do not retry on the same port we just failed on */
-                if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
-                        server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
-                        rc = socket->ops->connect(socket, (struct sockaddr *)
-                                        &server->addr.sockAddr6,
-                                        sizeof(struct sockaddr_in6), 0);
-                        if (rc >= 0)
-                                connected = true;
-                }
-        }
-        if (!connected) {
-                server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
-                rc = socket->ops->connect(socket, (struct sockaddr *)
-                                &server->addr.sockAddr6,
-                                sizeof(struct sockaddr_in6), 0);
                 if (rc >= 0)
-                        connected = true;
+                        return rc;
-        }
-        /* give up here - unless we want to retry on different
-                protocol families some day */
-        if (!connected) {
-                if (orig_port)
-                        server->addr.sockAddr6.sin6_port = orig_port;
-                cFYI(1, "Error %d connecting to server via ipv6", rc);
-                sock_release(socket);
-                server->ssocket = NULL;
-                return rc;
-        }
-        /*
-         * Eventually check for other socket options to change from
-         * the default. sock_setsockopt not used because it expects
-         * user space buffer
-         */
-        socket->sk->sk_rcvtimeo = 7 * HZ;
-        socket->sk->sk_sndtimeo = 5 * HZ;
-        if (server->tcp_nodelay) {
+                /*
+                 * Port 445 failed: switch to port 139 (RFC1001_PORT) and
+                 * let the generic_ip_connect() call below retry with it.
+                 */
-                val = 1;
+                *sport = htons(RFC1001_PORT);
-                rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
-                                (char *)&val, sizeof(val));
-                if (rc)
-                        cFYI(1, "set TCP_NODELAY socket option error %d", rc);
         }
-        server->ssocket = socket;
+        return generic_ip_connect(server);
-        return rc;
 }
 void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3840eddbfb7..2e773825835 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -135,9 +135,9 @@ static void setup_cifs_dentry(struct cifsTconInfo *tcon,
                              struct inode *newinode)
 {
        if (tcon->nocase)
-                direntry->d_op = &cifs_ci_dentry_ops;
+                d_set_d_op(direntry, &cifs_ci_dentry_ops);
        else
-                direntry->d_op = &cifs_dentry_ops;
+                d_set_d_op(direntry, &cifs_dentry_ops);
        d_instantiate(direntry, newinode);
 }
@@ -293,10 +293,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        args.uid = NO_CHANGE_64;
                        args.gid = NO_CHANGE_64;
                }
-                CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+                CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
-                                        cifs_sb->local_nls,
+                                        current->tgid);
-                                        cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        } else {
                /* BB implement mode setting via Windows security
                   descriptors e.g. */
@@ -421,9 +419,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                rc = cifs_get_inode_info_unix(&newinode, full_path,
                                                inode->i_sb, xid);
                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
+                        d_set_d_op(direntry, &cifs_ci_dentry_ops);
                else
-                        direntry->d_op = &cifs_dentry_ops;
+                        d_set_d_op(direntry, &cifs_dentry_ops);
                if (rc == 0)
                        d_instantiate(direntry, newinode);
@@ -604,9 +602,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        if ((rc == 0) && (newInode != NULL)) {
                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
+                        d_set_d_op(direntry, &cifs_ci_dentry_ops);
                else
-                        direntry->d_op = &cifs_dentry_ops;
+                        d_set_d_op(direntry, &cifs_dentry_ops);
                d_add(direntry, newInode);
                if (posix_open) {
                        filp = lookup_instantiate_filp(nd, direntry,
@@ -634,9 +632,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                rc = 0;
                direntry->d_time = jiffies;
                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
+                        d_set_d_op(direntry, &cifs_ci_dentry_ops);
                else
-                        direntry->d_op = &cifs_dentry_ops;
+                        d_set_d_op(direntry, &cifs_dentry_ops);
                d_add(direntry, NULL);
        /*      if it was once a directory (but how can we tell?) we could do
                shrink_dcache_parent(direntry); */
@@ -656,22 +654,37 @@ lookup_out:
 static int
 cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
-        int isValid = 1;
+        /*
+         * Revalidate a cached dentry: return 1 to keep using it, 0 to
+         * force a fresh lookup, or -ECHILD when called in RCU lookup
+         * mode.
+         *
+         * nd may be NULL (see the !nd check below), so it must be
+         * tested before nd->flags is read.
+         */
+        if (nd && (nd->flags & LOOKUP_RCU))
+                return -ECHILD;
         if (direntry->d_inode) {
                 if (cifs_revalidate_dentry(direntry))
                         return 0;
-        } else {
+                else
-                cFYI(1, "neg dentry 0x%p name = %s",
+                        return 1;
-                         direntry, direntry->d_name.name);
+        }
-                if (time_after(jiffies, direntry->d_time + HZ) ||
-                        !lookupCacheEnabled) {
+        /*
-                        d_drop(direntry);
+         * This may be nfsd (or something), anyway, we can't see the
-                        isValid = 0;
+         * intent of this. So, since this can be for creation, drop it.
-                }
+         */
+        if (!nd)
+                return 0;
+        /*
+         * Drop the negative dentry, in order to make sure to use the
+         * case sensitive name which is specified by user if this is
+         * for creation.
+         */
+        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+                if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+                        return 0;
         }
-        return isValid;
+        /*
+         * A negative dentry older than HZ jiffies, or any negative
+         * dentry when lookup caching is disabled, is not trusted.
+         */
+        if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
+                return 0;
+        return 1;
 }
 /* static int cifs_d_delete(struct dentry *direntry)
@@ -688,9 +701,10 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
-static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *q)
 {
-        struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+        struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
        unsigned long hash;
        int i;
@@ -703,21 +717,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
        return 0;
 }
-static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
+/*
+ * Compare the candidate name (str, len) against name using the
+ * codepage taken from pinode's superblock.  Returns 0 when the lengths
+ * are equal and nls_strnicmp() reports a match, 1 otherwise.
+ */
+static int cifs_ci_compare(const struct dentry *parent,
-                           struct qstr *b)
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+        struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
-        if ((a->len == b->len) &&
+        if ((name->len == len) &&
-            (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) {
+            (nls_strnicmp(codepage, name->name, str, len) == 0))
-                /*
-                 * To preserve case, don't let an existing negative dentry's
-                 * case take precedence.  If a is not a negative dentry, this
-                 * should have no side effects
-                 */
-                memcpy((void *)a->name, b->name, a->len);
                 return 0;
-        }
         return 1;
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a28660ca2b..d843631c028 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
                return FILE_OPEN;
 }
-static inline int cifs_open_inode_helper(struct inode *inode,
-        struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
-        char *full_path, int xid)
-{
-        struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-        struct timespec temp;
-        int rc;
-        if (pCifsInode->clientCanCacheRead) {
-                /* we have the inode open somewhere else
-                   no need to discard cache data */
-                goto client_can_cache;
-        }
-        /* BB need same check in cifs_create too? */
-        /* if not oplocked, invalidate inode pages if mtime or file
-           size changed */
-        temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-        if (timespec_equal(&inode->i_mtime, &temp) &&
-                           (inode->i_size ==
-                            (loff_t)le64_to_cpu(buf->EndOfFile))) {
-                cFYI(1, "inode unchanged on server");
-        } else {
-                if (inode->i_mapping) {
-                        /* BB no need to lock inode until after invalidate
-                        since namei code should already have it locked? */
-                        rc = filemap_write_and_wait(inode->i_mapping);
-                        mapping_set_error(inode->i_mapping, rc);
-                }
-                cFYI(1, "invalidating remote inode since open detected it "
-                         "changed");
-                invalidate_remote_inode(inode);
-        }
-client_can_cache:
-        if (pTcon->unix_ext)
-                rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
-                                              xid);
-        else
-                rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-                                         xid, NULL);
-        cifs_set_oplock_level(pCifsInode, oplock);
-        return rc;
-}
 int cifs_posix_open(char *full_path, struct inode **pinode,
                        struct super_block *sb, int mode, unsigned int f_flags,
                        __u32 *poplock, __u16 *pnetfid, int xid)
@@ -213,6 +166,76 @@ posix_open_ret:
        return rc;
 }
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+             struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+             __u16 *pnetfid, int xid)
+{
+        int rc;
+        int desiredAccess;
+        int disposition;
+        FILE_ALL_INFO *buf;
+        desiredAccess = cifs_convert_flags(f_flags);
+/*********************************************************************
+ *  cifs_nt_open: open full_path with CIFSSMBOpen() (or SMBLegacyOpen()
+ *  when the session lacks CAP_NT_SMBS) and, if that succeeds, refresh
+ *  the inode via cifs_get_inode_info_unix() or cifs_get_inode_info().
+ *  pnetfid and poplock are passed through to the open call.  Returns
+ *  -ENOMEM if the info buffer cannot be allocated, otherwise the rc of
+ *  the open or of the inode refresh.
+ *
+ *  open flag mapping table:
+ *
+ *      POSIX Flag            CIFS Disposition
+ *      ----------            ----------------
+ *      O_CREAT               FILE_OPEN_IF
+ *      O_CREAT | O_EXCL      FILE_CREATE
+ *      O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *      O_TRUNC               FILE_OVERWRITE
+ *      none of the above     FILE_OPEN
+ *
+ *      Note that there is not a direct match for disposition
+ *      FILE_SUPERSEDE (i.e. create whether or not file exists), although
+ *      O_CREAT | O_TRUNC is similar but truncates the existing
+ *      file rather than creating a new file as FILE_SUPERSEDE does
+ *      (which uses the attributes / metadata passed in on open call).
+ *
+ *  O_SYNC is a reasonable match to CIFS writethrough flag
+ *  and the read write flags match reasonably.  O_LARGEFILE
+ *  is irrelevant because largefile support is always used
+ *  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *  O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+        disposition = cifs_get_disposition(f_flags);
+        /*
+         * BB pass O_SYNC flag through on file attributes .. BB
+         *
+         * buf is handed to the open call and, on the non-unix path, to
+         * cifs_get_inode_info(); it is freed at "out" on every path
+         * after this allocation.
+         */
+        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        if (tcon->ses->capabilities & CAP_NT_SMBS)
+                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
+                         desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+                                 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        else
+                rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
+                        desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+                                & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto out;
+        if (tcon->unix_ext)
+                rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+                                              xid);
+        else
+                rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+                                         xid, pnetfid);
+out:
+        kfree(buf);
+        return rc;
+}
 struct cifsFileInfo *
 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
                  struct tcon_link *tlink, __u32 oplock)
@@ -317,10 +340,8 @@ int cifs_open(struct inode *inode, struct file *file)
        struct cifsFileInfo *pCifsFile = NULL;
        struct cifsInodeInfo *pCifsInode;
        char *full_path = NULL;
-        int desiredAccess;
+        bool posix_open_ok = false;
-        int disposition;
        __u16 netfid;
-        FILE_ALL_INFO *buf = NULL;
        xid = GetXid();
@@ -358,17 +379,7 @@ int cifs_open(struct inode *inode, struct file *file)
                                file->f_flags, &oplock, &netfid, xid);
                if (rc == 0) {
                        cFYI(1, "posix open succeeded");
+                        posix_open_ok = true;
-                        pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-                                                      oplock);
-                        if (pCifsFile == NULL) {
-                                CIFSSMBClose(xid, tcon, netfid);
-                                rc = -ENOMEM;
-                        }
-                        cifs_fscache_set_inode_cookie(inode, file);
-                        goto out;
                } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
                        if (tcon->ses->serverNOS)
                                cERROR(1, "server %s of type %s returned"
@@ -385,103 +396,39 @@ int cifs_open(struct inode *inode, struct file *file)
                   or DFS errors */
        }
-        desiredAccess = cifs_convert_flags(file->f_flags);
+        if (!posix_open_ok) {
+                rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
-/*********************************************************************
+                                  file->f_flags, &oplock, &netfid, xid);
- *  open flag mapping table:
+                if (rc)
- *
+                        goto out;
- *      POSIX Flag            CIFS Disposition
- *      ----------            ----------------
- *      O_CREAT               FILE_OPEN_IF
- *      O_CREAT | O_EXCL      FILE_CREATE
- *      O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
- *      O_TRUNC               FILE_OVERWRITE
- *      none of the above     FILE_OPEN
- *
- *      Note that there is not a direct match between disposition
- *      FILE_SUPERSEDE (ie create whether or not file exists although
- *      O_CREAT | O_TRUNC is similar but truncates the existing
- *      file rather than creating a new file as FILE_SUPERSEDE does
- *      (which uses the attributes / metadata passed in on open call)
- *?
- *?  O_SYNC is a reasonable match to CIFS writethrough flag
- *?  and the read write flags match reasonably.  O_LARGEFILE
- *?  is irrelevant because largefile support is always used
- *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
- *       O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
- *********************************************************************/
-        disposition = cifs_get_disposition(file->f_flags);
-        /* BB pass O_SYNC flag through on file attributes .. BB */
-        /* Also refresh inode by passing in file_info buf returned by SMBOpen
-           and calling get_inode_info with returned buf (at least helps
-           non-Unix server case) */
-        /* BB we can not do this if this is the second open of a file
-           and the first handle has writebehind data, we might be
-           able to simply do a filemap_fdatawrite/filemap_fdatawait first */
-        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-        if (!buf) {
-                rc = -ENOMEM;
-                goto out;
-        }
-        if (tcon->ses->capabilities & CAP_NT_SMBS)
-                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
-                         desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-                                 & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        else
-                rc = -EIO; /* no NT SMB support fall into legacy open below */
-        if (rc == -EIO) {
-                /* Old server, try legacy style OpenX */
-                rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
-                        desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-                                & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        }
-        if (rc) {
-                cFYI(1, "cifs_open returned 0x%x", rc);
-                goto out;
        }
-        rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
-        if (rc != 0)
-                goto out;
        pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
        if (pCifsFile == NULL) {
+                CIFSSMBClose(xid, tcon, netfid);
                rc = -ENOMEM;
                goto out;
        }
        cifs_fscache_set_inode_cookie(inode, file);
-        if (oplock & CIFS_CREATE_ACTION) {
+        if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
                /* time to set mode which we can not set earlier due to
                   problems creating new read-only files */
-                if (tcon->unix_ext) {
+                struct cifs_unix_set_info_args args = {
-                        struct cifs_unix_set_info_args args = {
+                        .mode   = inode->i_mode,
-                                .mode   = inode->i_mode,
+                        .uid    = NO_CHANGE_64,
-                                .uid    = NO_CHANGE_64,
+                        .gid    = NO_CHANGE_64,
-                                .gid    = NO_CHANGE_64,
+                        .ctime  = NO_CHANGE_64,
-                                .ctime  = NO_CHANGE_64,
+                        .atime  = NO_CHANGE_64,
-                                .atime  = NO_CHANGE_64,
+                        .mtime  = NO_CHANGE_64,
-                                .mtime  = NO_CHANGE_64,
+                        .device = 0,
-                                .device = 0,
+                };
-                        };
+                CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
-                        CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+                                        pCifsFile->pid);
-                                               cifs_sb->local_nls,
-                                               cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
-                }
        }
 out:
-        kfree(buf);
        kfree(full_path);
        FreeXid(xid);
        cifs_put_tlink(tlink);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 589f3e3f6e0..0c7e36910e3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -518,6 +518,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+        fattr->cf_createtime = le64_to_cpu(info->CreationTime);
        if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
                fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -779,6 +780,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
        if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
                return 0;
+        /* use createtime like an i_generation field */
+        if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+                return 0;
        /* don't match inode of different type */
        if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
                return 0;
@@ -796,6 +801,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
        struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
        CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+        CIFS_I(inode)->createtime = fattr->cf_createtime;
        return 0;
 }
@@ -809,14 +815,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        return true;
                }
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        return false;
 }
@@ -1319,9 +1325,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        to set uid/gid */
                        inc_nlink(inode);
                        if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
+                                d_set_d_op(direntry, &cifs_ci_dentry_ops);
                        else
-                                direntry->d_op = &cifs_dentry_ops;
+                                d_set_d_op(direntry, &cifs_dentry_ops);
                        cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
                        cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1363,9 +1369,9 @@ mkdir_get_info:
                                                 inode->i_sb, xid, NULL);
                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
+                        d_set_d_op(direntry, &cifs_ci_dentry_ops);
                else
-                        direntry->d_op = &cifs_dentry_ops;
+                        d_set_d_op(direntry, &cifs_dentry_ops);
                d_instantiate(direntry, newinode);
                 /* setting nlink not necessary except in cases where we
                  * failed to get it from the server or was set bogus */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 85cdbf831e7..fe2f6a93c49 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -525,9 +525,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                              rc);
                } else {
                        if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
+                                d_set_d_op(direntry, &cifs_ci_dentry_ops);
                        else
-                                direntry->d_op = &cifs_dentry_ops;
+                                d_set_d_op(direntry, &cifs_dentry_ops);
                        d_instantiate(direntry, newinode);
                }
        }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a73eb9f4bda..76b1b37c9e6 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
        cFYI(1, "For %s", name->name);
        if (parent->d_op && parent->d_op->d_hash)
-                parent->d_op->d_hash(parent, name);
+                parent->d_op->d_hash(parent, parent->d_inode, name);
        else
                name->hash = full_name_hash(name->name, name->len);
@@ -103,9 +103,9 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
        }
        if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
-                dentry->d_op = &cifs_ci_dentry_ops;
+                d_set_d_op(dentry, &cifs_ci_dentry_ops);
        else
-                dentry->d_op = &cifs_dentry_ops;
+                d_set_d_op(dentry, &cifs_dentry_ops);
        alias = d_materialise_unique(dentry, inode);
        if (alias != NULL) {
@@ -160,6 +160,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
        fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+        fattr->cf_createtime = le64_to_cpu(info->CreationTime);
        fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
        fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed..eb746486e49 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
        return 0;
 }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* BB Move to ntlmssp.c eventually */
 /* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
        NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
        __u32 flags;
+        memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
        memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
        sec_blob->MessageType = NtLmNegotiate;
        /* BB is NTLMV2 session security format easier to use here? */
        flags = NTLMSSP_NEGOTIATE_56 |  NTLMSSP_REQUEST_TARGET |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        if (ses->server->secMode &
                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
                                NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        }
-        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+        sec_blob->NegotiateFlags = cpu_to_le32(flags);
        sec_blob->WorkstationName.BufferOffset = 0;
        sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        flags = NTLMSSP_NEGOTIATE_56 |
                NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        if (ses->server->secMode &
           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
        tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
-        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+        sec_blob->NegotiateFlags = cpu_to_le32(flags);
        sec_blob->LmChallengeResponse.BufferOffset =
                                cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->WorkstationName.MaximumLength = 0;
        tmp += 2;
-        if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+        if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
-                        !calc_seckey(ses)) {
+                (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+                        && !calc_seckey(ses)) {
                memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
        return rc;
 }
-static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
-                                 struct cifsSesInfo *ses)
-{
-        build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
-        pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
-        return;
-}
-#endif
 int
 CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
               const struct nls_table *nls_cp)
@@ -814,71 +804,70 @@ ssetup_ntlmssp_authenticate:
                rc = -ENOSYS;
                goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
-        } else {
+        } else if (type == RawNTLMSSP) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+                if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-                if (type == RawNTLMSSP) {
+                        cERROR(1, "NTLMSSP requires Unicode support");
-                        if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+                        rc = -ENOSYS;
-                                cERROR(1, "NTLMSSP requires Unicode support");
+                        goto ssetup_exit;
-                                rc = -ENOSYS;
+                }
+                cFYI(1, "ntlmssp session setup phase %d", phase);
+                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+                capabilities |= CAP_EXTENDED_SECURITY;
+                pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+                switch(phase) {
+                case NtLmNegotiate:
+                        build_ntlmssp_negotiate_blob(
+                                pSMB->req.SecurityBlob, ses);
+                        iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+                        iov[1].iov_base = pSMB->req.SecurityBlob;
+                        pSMB->req.SecurityBlobLength =
+                                cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+                        break;
+                case NtLmAuthenticate:
+                        /*
+                         * 5 is an empirical value, large enough to hold
+                         * authenticate message plus max 10 of av pairs,
+                         * domain, user, workstation names, flags, etc.
+                         */
+                        ntlmsspblob = kzalloc(
+                                5*sizeof(struct _AUTHENTICATE_MESSAGE),
+                                GFP_KERNEL);
+                        if (!ntlmsspblob) {
+                                cERROR(1, "Can't allocate NTLMSSP blob");
+                                rc = -ENOMEM;
                                goto ssetup_exit;
                        }
-                        cFYI(1, "ntlmssp session setup phase %d", phase);
+                        rc = build_ntlmssp_auth_blob(ntlmsspblob,
-                        pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+                                                &blob_len, ses, nls_cp);
-                        capabilities |= CAP_EXTENDED_SECURITY;
+                        if (rc)
-                        pSMB->req.Capabilities |= cpu_to_le32(capabilities);
-                        if (phase == NtLmNegotiate) {
-                                setup_ntlmssp_neg_req(pSMB, ses);
-                                iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
-                                iov[1].iov_base = &pSMB->req.SecurityBlob[0];
-                        } else if (phase == NtLmAuthenticate) {
-                                /* 5 is an empirical value, large enought to
-                                 * hold authenticate message, max 10 of
-                                 * av paris, doamin,user,workstation mames,
-                                 * flags etc..
-                                 */
-                                ntlmsspblob = kmalloc(
-                                        5*sizeof(struct _AUTHENTICATE_MESSAGE),
-                                        GFP_KERNEL);
-                                if (!ntlmsspblob) {
-                                        cERROR(1, "Can't allocate NTLMSSP");
-                                        rc = -ENOMEM;
-                                        goto ssetup_exit;
-                                }
-                                rc = build_ntlmssp_auth_blob(ntlmsspblob,
-                                                        &blob_len, ses, nls_cp);
-                                if (rc)
-                                        goto ssetup_exit;
-                                iov[1].iov_len = blob_len;
-                                iov[1].iov_base = ntlmsspblob;
-                                pSMB->req.SecurityBlobLength =
-                                        cpu_to_le16(blob_len);
-                                /* Make sure that we tell the server that we
-                                   are using the uid that it just gave us back
-                                   on the response (challenge) */
-                                smb_buf->Uid = ses->Suid;
-                        } else {
-                                cERROR(1, "invalid phase %d", phase);
-                                rc = -ENOSYS;
                                goto ssetup_exit;
-                        }
+                        iov[1].iov_len = blob_len;
-                        /* unicode strings must be word aligned */
+                        iov[1].iov_base = ntlmsspblob;
-                        if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+                        pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
-                                *bcc_ptr = 0;
+                        /*
-                                bcc_ptr++;
+                         * Make sure that we tell the server that we are using
-                        }
+                         * the uid that it just gave us back on the response
-                        unicode_oslm_strings(&bcc_ptr, nls_cp);
+                         * (challenge)
-                } else {
+                         */
-                        cERROR(1, "secType %d not supported!", type);
+                        smb_buf->Uid = ses->Suid;
+                        break;
+                default:
+                        cERROR(1, "invalid phase %d", phase);
                        rc = -ENOSYS;
                        goto ssetup_exit;
                }
-#else
+                /* unicode strings must be word aligned */
+                if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+                        *bcc_ptr = 0;
+                        bcc_ptr++;
+                }
+                unicode_oslm_strings(&bcc_ptr, nls_cp);
+        } else {
                cERROR(1, "secType %d not supported!", type);
                rc = -ENOSYS;
                goto ssetup_exit;
-#endif
        }
        iov[2].iov_base = str_area;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc..59ca81b1691 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,7 +119,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
        if (ssocket == NULL)
                return -ENOTSOCK; /* BB eventually add reconnect code here */
-        smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
+        smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
        smb_msg.msg_namelen = sizeof(struct sockaddr);
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 9060f08e70c..5525e1c660f 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
        struct list_head *child;
        struct dentry *de;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        list_for_each(child, &parent->d_subdirs)
        {
                de = list_entry(child, struct dentry, d_u.d_child);
@@ -102,7 +102,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
                        continue;
                coda_flag_inode(de->d_inode, flag);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
        return; 
 }
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 5d8b3553960..29badd91360 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
@@ -47,7 +48,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
-static int coda_dentry_delete(struct dentry *);
+static int coda_dentry_delete(const struct dentry *);
 /* support routines */
 static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -125,7 +126,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
                return ERR_PTR(error);
 exit:
-        entry->d_op = &coda_dentry_operations;
+        d_set_d_op(entry, &coda_dentry_operations);
        if (inode && (type & CODA_NOCACHE))
                coda_flag_inode(inode, C_VATTR | C_PURGE);
@@ -134,10 +135,13 @@ exit:
 }
-int coda_permission(struct inode *inode, int mask)
+int coda_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
        if (!mask)
@@ -541,9 +545,13 @@ out:
 /* called when a cache lookup succeeds */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 {
-        struct inode *inode = de->d_inode;
+        struct inode *inode;
        struct coda_inode_info *cii;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = de->d_inode;
        if (!inode || coda_isroot(inode))
                goto out;
        if (is_bad_inode(inode))
@@ -559,7 +567,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
        if (cii->c_flags & C_FLUSH) 
                coda_flag_inode_children(inode, C_FLUSH);
-        if (atomic_read(&de->d_count) > 1)
+        if (de->d_count > 1)
                /* pretend it's valid, but don't change the flags */
                goto out;
@@ -577,7 +585,7 @@ out:
 * This is the callback from dput() when d_count is going to 0.
 * We use this to unhash dentries with bad inodes.
 */
-static int coda_dentry_delete(struct dentry * dentry)
+static int coda_dentry_delete(const struct dentry * dentry)
 {
        int flags;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 1a49c1708a5..f065a5d31a1 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -56,11 +56,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void coda_destroy_inode(struct inode *inode)
+static void coda_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
+static void coda_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, coda_i_callback);
+}
 static void init_once(void *foo)
 {
        struct coda_inode_info *ei = (struct coda_inode_info *) foo;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 2fd89b5c5c7..741f0bd0391 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@
 #include <linux/coda_psdev.h>
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask);
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
                        unsigned long user_data);
@@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = {
 };
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask)
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a60579b007b..61abb638b4b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -42,7 +42,7 @@
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
-#include <linux/videodev.h>
+#include <linux/videodev2.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/blkdev.h>
@@ -836,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
 COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
+COMPATIBLE_IOCTL(TIOCGDEV)
 COMPATIBLE_IOCTL(TIOCCBRK)
 COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df4..026cf68553a 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -120,7 +120,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 {
        struct config_item * item = NULL;
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        if (!d_unhashed(dentry)) {
                struct configfs_dirent * sd = dentry->d_fsdata;
                if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +129,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
                } else
                        item = config_item_get(sd->s_element);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return item;
 }
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c69..36637a8c1ed 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,7 @@ static void configfs_d_iput(struct dentry * dentry,
 * We _must_ delete our dentries on last dput, as the chain-to-parent
 * behavior is required to clear the parents of default_groups.
 */
-static int configfs_d_delete(struct dentry *dentry)
+static int configfs_d_delete(const struct dentry *dentry)
 {
        return 1;
 }
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
        sd->s_mode = mode;
        sd->s_dentry = dentry;
-        if (dentry) {
+        if (dentry)
                dentry->d_fsdata = configfs_get(sd);
-                dentry->d_op = &configfs_dentry_ops;
-        }
        return 0;
 }
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
                error = configfs_create(d, mode, init_dir);
                if (!error) {
                        inc_nlink(p->d_inode);
-                        (d)->d_op = &configfs_dentry_ops;
                } else {
                        struct configfs_dirent *sd = d->d_fsdata;
                        if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
                                   CONFIGFS_ITEM_LINK);
        if (!err) {
                err = configfs_create(dentry, mode, init_symlink);
-                if (!err)
+                if (err) {
-                        dentry->d_op = &configfs_dentry_ops;
-                else {
                        struct configfs_dirent *sd = dentry->d_fsdata;
                        if (sd) {
                                spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
        if (d->d_inode)
                simple_rmdir(parent->d_inode,d);
-        pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+        pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
-                 atomic_read(&d->d_count));
        dput(parent);
 }
@@ -448,7 +442,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
                return error;
        }
-        dentry->d_op = &configfs_dentry_ops;
+        d_set_d_op(dentry, &configfs_dentry_ops);
        d_rehash(dentry);
        return 0;
@@ -493,7 +487,11 @@ static struct dentry * configfs_lookup(struct inode *dir,
                 * If it doesn't exist and it isn't a NOT_PINNED item,
                 * it must be negative.
                 */
-                return simple_lookup(dir, dentry, nd);
+                if (dentry->d_name.len > NAME_MAX)
+                        return ERR_PTR(-ENAMETOOLONG);
+                d_set_d_op(dentry, &configfs_dentry_ops);
+                d_add(dentry, NULL);
+                return NULL;
        }
 out:
@@ -685,6 +683,7 @@ static int create_default_group(struct config_group *parent_group,
        ret = -ENOMEM;
        child = d_alloc(parent, &name);
        if (child) {
+                d_set_d_op(child, &configfs_dentry_ops);
                d_add(child, NULL);
                ret = configfs_attach_group(&parent_group->cg_item,
@@ -1682,6 +1681,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
        err = -ENOMEM;
        dentry = d_alloc(configfs_sb->s_root, &name);
        if (dentry) {
+                d_set_d_op(dentry, &configfs_dentry_ops);
                d_add(dentry, NULL);
                err = configfs_attach_group(sd->s_element, &group->cg_item,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 253476d78ed..c83f4768eea 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
        struct dentry * dentry = sd->s_dentry;
        if (dentry) {
-                spin_lock(&dcache_lock);
                spin_lock(&dentry->d_lock);
                if (!(d_unhashed(dentry) && dentry->d_inode)) {
-                        dget_locked(dentry);
+                        dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
                        simple_unlink(parent->d_inode, dentry);
-                } else {
+                } else
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
-                }
        }
 }
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0..e141939080f 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
 static DEFINE_MUTEX(read_mutex);
-/* These two macros may change in future, to provide better st_ino
+/* These macros may change in future, to provide better st_ino semantics. */
-   semantics. */
-#define CRAMINO(x)      (((x)->offset && (x)->size)?(x)->offset<<2:1)
 #define OFFSET(x)       ((x)->i_ino)
-static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
+/*
+ * Choose the inode number for the on-disk entry @cino found at byte @offset.
+ * Regular files, directories and symlinks that have both a non-zero data
+ * offset and a non-zero size are numbered by that data offset shifted left
+ * by two; every other entry is numbered @offset + 1.
+ */
+static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
 {
+        unsigned int fmt = cino->mode & S_IFMT;
+        /*
+         * The file mode test fixes buggy mkcramfs implementations where
+         * cramfs_inode->offset is set to a non zero value for entries
+         * which did not contain data, like devices node and fifos.
+         */
+        if (cino->offset && cino->size &&
+            (fmt == S_IFREG || fmt == S_IFDIR || fmt == S_IFLNK))
+                return cino->offset << 2;
+        return offset + 1;
+}
+/*
+ * Look up, or create and initialise, the in-core inode for the on-disk
+ * entry @cramfs_inode located at byte @offset in the image. The inode
+ * number comes from cramino(). An inode already present in the inode
+ * cache is returned as is; a new one is filled in and unlocked.
+ *
+ * Returns NULL if no inode could be obtained. Callers in this file test
+ * the result with "!root" or hand it straight to d_add(), so an ERR_PTR()
+ * value must not be returned from here.
+ */
+static struct inode *get_cramfs_inode(struct super_block *sb,
+        struct cramfs_inode *cramfs_inode, unsigned int offset)
+{
+        struct inode *inode;
         static struct timespec zerotime;
+        inode = iget_locked(sb, cramino(cramfs_inode, offset));
+        if (!inode)
+                return NULL;
+        if (!(inode->i_state & I_NEW))
+                return inode;
+        switch (cramfs_inode->mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_fop = &generic_ro_fops;
+                inode->i_data.a_ops = &cramfs_aops;
+                break;
+        case S_IFDIR:
+                inode->i_op = &cramfs_dir_inode_operations;
+                inode->i_fop = &cramfs_directory_operations;
+                break;
+        case S_IFLNK:
+                inode->i_op = &page_symlink_inode_operations;
+                inode->i_data.a_ops = &cramfs_aops;
+                break;
+        default:
+                init_special_inode(inode, cramfs_inode->mode,
+                                old_decode_dev(cramfs_inode->size));
+        }
         inode->i_mode = cramfs_inode->mode;
         inode->i_uid = cramfs_inode->uid;
-        inode->i_size = cramfs_inode->size;
-        inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
         inode->i_gid = cramfs_inode->gid;
+        /* if the lower 2 bits are zero, the inode contains data */
+        if (!(inode->i_ino & 3)) {
+                inode->i_size = cramfs_inode->size;
+                inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+        }
         /* Struct copy intentional */
         inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
         /* inode->i_nlink is left 1 - arguably wrong for directories,
            but it's the best we can do without reading the directory
            contents.  1 yields the right result in GNU find, even
            without -noleaf option. */
-        if (S_ISREG(inode->i_mode)) {
-                inode->i_fop = &generic_ro_fops;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &cramfs_dir_inode_operations;
-                inode->i_fop = &cramfs_directory_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
-                inode->i_op = &page_symlink_inode_operations;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else {
-                init_special_inode(inode, inode->i_mode,
-                        old_decode_dev(cramfs_inode->size));
-        }
-}
-static struct inode *get_cramfs_inode(struct super_block *sb,
+        unlock_new_inode(inode);
-                                struct cramfs_inode * cramfs_inode)
-{
-        struct inode *inode;
-        if (CRAMINO(cramfs_inode) == 1) {
-                inode = new_inode(sb);
-                if (inode) {
-                        inode->i_ino = 1;
-                        setup_inode(inode, cramfs_inode);
-                }
-        } else {
-                inode = iget_locked(sb, CRAMINO(cramfs_inode));
-                if (inode && (inode->i_state & I_NEW)) {
-                        setup_inode(inode, cramfs_inode);
-                        unlock_new_inode(inode);
-                }
-        }
         return inode;
 }
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
                printk(KERN_ERR "cramfs: root is not a directory\n");
                goto out;
        }
+        /* correct strange, hard-coded permissions of mkcramfs */
+        super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
        root_offset = super.root.offset << 2;
        if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
                sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
        /* Set it all up.. */
        sb->s_op = &cramfs_ops;
-        root = get_cramfs_inode(sb, &super.root);
+        root = get_cramfs_inode(sb, &super.root, 0);
        if (!root)
                goto out;
        sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                 */
                namelen = de->namelen << 2;
                memcpy(buf, name, namelen);
-                ino = CRAMINO(de);
+                ino = cramino(de, OFFSET(inode) + offset);
                mode = de->mode;
                mutex_unlock(&read_mutex);
                nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
                struct cramfs_inode *de;
                char *name;
                int namelen, retval;
+                int dir_off = OFFSET(dir) + offset;
-                de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
+                de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
                name = (char *)(de+1);
                /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
                if (!retval) {
                        struct cramfs_inode entry = *de;
                        mutex_unlock(&read_mutex);
-                        d_add(dentry, get_cramfs_inode(dir->i_sb, &entry));
+                        d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
                        return NULL;
                }
                /* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9d4e6..5699d4c027c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
 #include <linux/hardirq.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
+/*
+ * Usage:
+ * dcache->d_inode->i_lock protects:
+ *   - i_dentry, d_alias, d_inode of aliases
+ * dcache_hash_bucket lock protects:
+ *   - the dcache hash table
+ * s_anon bl list spinlock protects:
+ *   - the s_anon list (see __d_drop)
+ * dcache_lru_lock protects:
+ *   - the dcache lru lists and counters
+ * d_lock protects:
+ *   - d_flags
+ *   - d_name
+ *   - d_lru
+ *   - d_count
+ *   - d_unhashed()
+ *   - d_parent and d_subdirs
+ *   - children's d_child and d_parent
+ *   - d_alias, d_inode
+ *
+ * Ordering:
+ * dentry->d_inode->i_lock
+ *   dentry->d_lock
+ *     dcache_lru_lock
+ *     dcache_hash_bucket lock
+ *     s_anon lock
+ *
+ * If there is an ancestor relationship:
+ * dentry->d_parent->...->d_parent->d_lock
+ *   ...
+ *     dentry->d_parent->d_lock
+ *       dentry->d_lock
+ *
+ * If no ancestor relationship:
+ * if (dentry1 < dentry2)
+ *   dentry1->d_lock
+ *     dentry2->d_lock
+ */
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
-EXPORT_SYMBOL(dcache_lock);
+EXPORT_SYMBOL(rename_lock);
 static struct kmem_cache *dentry_cache __read_mostly;
-#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
 /*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
@@ -60,22 +98,51 @@ static struct kmem_cache *dentry_cache __read_mostly;
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
-static struct hlist_head *dentry_hashtable __read_mostly;
+struct dcache_hash_bucket {
+        struct hlist_bl_head head;
+};
+static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
+/* Map a (parent dentry, name hash) pair to its bucket in dentry_hashtable. */
+static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
+                                        unsigned long hash)
+{
+        unsigned long mixed;
+        mixed = hash + ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
+        mixed ^= (mixed ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS;
+        return &dentry_hashtable[mixed & D_HASHMASK];
+}
+/* Lock a hash bucket: the lock is bit 0 of the bucket's list head pointer. */
+static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
+{
+        unsigned long *lock_word = (unsigned long *)&b->head.first;
+        bit_spin_lock(0, lock_word);
+}
+/* Unlock a hash bucket previously locked with spin_lock_bucket(). */
+static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
+{
+        unsigned long *lock_word = (unsigned long *)&b->head.first;
+        __bit_spin_unlock(0, lock_word);
+}
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+/* Sum the per-CPU nr_dentry counters; a negative total is reported as 0. */
+static int get_nr_dentry(void)
+{
+        int cpu;
+        int total = 0;
+        for_each_possible_cpu(cpu)
+                total += per_cpu(nr_dentry, cpu);
+        if (total < 0)
+                return 0;
+        return total;
+}
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
                    size_t *lenp, loff_t *ppos)
 {
-        dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+        /* Refresh the total from the per-CPU counters before handing off to proc_dointvec(). */
+        dentry_stat.nr_dentry = get_nr_dentry();
-        dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
         return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -91,35 +158,50 @@ static void __d_free(struct rcu_head *head)
 }
 /*
- * no dcache_lock, please.
+ * no locks, please.
 */
 static void d_free(struct dentry *dentry)
 {
-        percpu_counter_dec(&nr_dentry);
+        /* Freeing a dentry that still has references is a bug. */
+        BUG_ON(dentry->d_count);
+        this_cpu_dec(nr_dentry);
         if (dentry->d_op && dentry->d_op->d_release)
                 dentry->d_op->d_release(dentry);
         /* if dentry was never inserted into hash, immediate free is OK */
-        if (hlist_unhashed(&dentry->d_hash))
+        if (hlist_bl_unhashed(&dentry->d_hash))
                 __d_free(&dentry->d_u.d_rcu);
         else
                 call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
+/**
+ * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the dentry whose d_seq is to be bumped
+ *
+ * After this call, in-progress rcu-walk path lookup will fail. This
+ * should be called after unhashing, and after changing d_inode (if
+ * the dentry has not already been unhashed).
+ *
+ * The caller must hold dentry->d_lock; this is asserted below.
+ */
+static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+{
+        assert_spin_locked(&dentry->d_lock);
+        /* Go through a barrier */
+        write_seqcount_barrier(&dentry->d_seq);
+}
 /*
 * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined.
+ * d_iput() operation if defined. Dentry has no refcount
+ * and is unhashed.
 */
 static void dentry_iput(struct dentry * dentry)
        __releases(dentry->d_lock)
-        __releases(dcache_lock)
+        __releases(dentry->d_inode->i_lock)
 {
        struct inode *inode = dentry->d_inode;
        if (inode) {
                dentry->d_inode = NULL;
                list_del_init(&dentry->d_alias);
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                if (!inode->i_nlink)
                        fsnotify_inoderemove(inode);
                if (dentry->d_op && dentry->d_op->d_iput)
@@ -128,40 +210,72 @@ static void dentry_iput(struct dentry * dentry)
                        iput(inode);
        } else {
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
        }
 }
 /*
- * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
+ * Release the dentry's inode, using the filesystem
+ * d_iput() operation if defined. dentry remains in-use.
+ */
+static void dentry_unlink_inode(struct dentry * dentry)
+        __releases(dentry->d_lock)
+        __releases(dentry->d_inode->i_lock)
+{
+        struct inode *inode = dentry->d_inode;
+        /* Detach the inode and take this dentry off the inode's alias list. */
+        dentry->d_inode = NULL;
+        list_del_init(&dentry->d_alias);
+        /* d_inode has changed: make in-progress rcu-walk lookups fail. */
+        dentry_rcuwalk_barrier(dentry);
+        /* Both locks are dropped before notifying and releasing the inode. */
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&inode->i_lock);
+        if (!inode->i_nlink)
+                fsnotify_inoderemove(inode);
+        /* Let the filesystem's d_iput() release the inode if it provides one. */
+        if (dentry->d_op && dentry->d_op->d_iput)
+                dentry->d_op->d_iput(dentry, inode);
+        else
+                iput(inode);
+}
+/*
+ * dentry_lru_(add|del|move_tail) must be called with d_lock held.
 */
 static void dentry_lru_add(struct dentry *dentry)
 {
         if (list_empty(&dentry->d_lru)) {
+                /* The LRU list and its counters are updated under dcache_lru_lock. */
+                spin_lock(&dcache_lru_lock);
                 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
                 dentry->d_sb->s_nr_dentry_unused++;
-                percpu_counter_inc(&nr_dentry_unused);
+                dentry_stat.nr_unused++;
+                spin_unlock(&dcache_lru_lock);
         }
 }
+/*
+ * Unlink @dentry from its LRU list and drop the per-superblock and global
+ * unused counts. Takes no lock itself; dentry_lru_del() calls it with
+ * dcache_lru_lock held.
+ */
+static void __dentry_lru_del(struct dentry *dentry)
+{
+        struct super_block *sb = dentry->d_sb;
+        list_del_init(&dentry->d_lru);
+        dentry_stat.nr_unused--;
+        sb->s_nr_dentry_unused--;
+}
 static void dentry_lru_del(struct dentry *dentry)
 {
         if (!list_empty(&dentry->d_lru)) {
-                list_del_init(&dentry->d_lru);
+                /* Only take dcache_lru_lock when the dentry is actually on an LRU list. */
+                spin_lock(&dcache_lru_lock);
-                dentry->d_sb->s_nr_dentry_unused--;
+                __dentry_lru_del(dentry);
-                percpu_counter_dec(&nr_dentry_unused);
+                spin_unlock(&dcache_lru_lock);
         }
 }
 static void dentry_lru_move_tail(struct dentry *dentry)
 {
+        /* Not yet on the LRU: add at the tail and count it; otherwise just move it to the tail. */
+        spin_lock(&dcache_lru_lock);
         if (list_empty(&dentry->d_lru)) {
                 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
                 dentry->d_sb->s_nr_dentry_unused++;
-                percpu_counter_inc(&nr_dentry_unused);
+                dentry_stat.nr_unused++;
         } else {
                 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
         }
+        spin_unlock(&dcache_lru_lock);
 }
 /**
@@ -171,22 +285,115 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 * The dentry must already be unhashed and removed from the LRU.
 *
 * If this is the root of the dentry tree, return NULL.
+ *
+ * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
+ * d_kill.
 */
-static struct dentry *d_kill(struct dentry *dentry)
+static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
         __releases(dentry->d_lock)
-        __releases(dcache_lock)
+        __releases(parent->d_lock)
+        __releases(dentry->d_inode->i_lock)
 {
-        struct dentry *parent;
+        /* Cut the dentry loose from its parent and the parent's child list. */
+        dentry->d_parent = NULL;
         list_del(&dentry->d_u.d_child);
-        /*drops the locks, at that point nobody can reach this dentry */
+        /* @parent may be NULL (dentry_kill() passes NULL for a root dentry). */
+        if (parent)
+                spin_unlock(&parent->d_lock);
         dentry_iput(dentry);
+        /*
+         * dentry_iput drops the locks, at which point nobody (except
+         * transient RCU lookups) can reach this dentry.
+         */
+        d_free(dentry);
+        return parent;
+}
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+ * be found through a VFS lookup any more. Note that this is different from
+ * deleting the dentry - d_delete will try to mark the dentry negative if
+ * possible, giving a successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+ * reason (NFS timeouts or autofs deletes).
+ *
+ * __d_drop requires dentry->d_lock.
+ *
+ * __d_drop() does nothing if DCACHE_UNHASHED is already set. Otherwise it
+ * sets the flag and unlinks d_hash under the matching bit lock: the
+ * superblock's s_anon head for DCACHE_DISCONNECTED dentries, the hash
+ * bucket chosen by d_hash() for all others.
+ */
+void __d_drop(struct dentry *dentry)
+{
+        if (!(dentry->d_flags & DCACHE_UNHASHED)) {
+                if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
+                        bit_spin_lock(0,
+                                (unsigned long *)&dentry->d_sb->s_anon.first);
+                        dentry->d_flags |= DCACHE_UNHASHED;
+                        hlist_bl_del_init(&dentry->d_hash);
+                        __bit_spin_unlock(0,
+                                (unsigned long *)&dentry->d_sb->s_anon.first);
+                } else {
+                        struct dcache_hash_bucket *b;
+                        b = d_hash(dentry->d_parent, dentry->d_name.hash);
+                        spin_lock_bucket(b);
+                        /*
+                         * We may not actually need to put DCACHE_UNHASHED
+                         * manipulations under the hash lock, but follow
+                         * the principle of least surprise.
+                         */
+                        dentry->d_flags |= DCACHE_UNHASHED;
+                        hlist_bl_del_rcu(&dentry->d_hash);
+                        spin_unlock_bucket(b);
+                        /* Unhashed from the lookup table: fail in-progress rcu-walk lookups. */
+                        dentry_rcuwalk_barrier(dentry);
+                }
+        }
+}
+EXPORT_SYMBOL(__d_drop);
+/* Locked wrapper: takes dentry->d_lock around __d_drop(). */
+void d_drop(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __d_drop(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_drop);
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ *
+ * The inode's i_lock and the parent's d_lock are only trylocked here.
+ * If either attempt fails, d_lock is released and the same dentry is
+ * returned untouched (no refcount change), so the caller must retry.
+ * On success the value returned is whatever d_kill() returns: the parent,
+ * or NULL for a root dentry.
+ */
+static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+        __releases(dentry->d_lock)
+{
+        struct inode *inode;
+        struct dentry *parent;
+        inode = dentry->d_inode;
+        if (inode && !spin_trylock(&inode->i_lock)) {
+relock:
+                spin_unlock(&dentry->d_lock);
+                cpu_relax();
+                return dentry; /* try again with same dentry */
+        }
         if (IS_ROOT(dentry))
                 parent = NULL;
         else
                 parent = dentry->d_parent;
-        d_free(dentry);
+        if (parent && !spin_trylock(&parent->d_lock)) {
-        return parent;
+                if (inode)
+                        spin_unlock(&inode->i_lock);
+                goto relock;
+        }
+        if (ref)
+                dentry->d_count--;
+        /* if dentry was on the d_lru list delete it from there */
+        dentry_lru_del(dentry);
+        /* if it was on the hash then remove it */
+        __d_drop(dentry);
+        return d_kill(dentry, parent);
 }
 /* 
@@ -214,34 +421,26 @@ static struct dentry *d_kill(struct dentry *dentry)
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
- *
- * no dcache lock, please.
 */
 void dput(struct dentry *dentry)
 {
        if (!dentry)
                return;
 repeat:
-        if (atomic_read(&dentry->d_count) == 1)
+        if (dentry->d_count == 1)
                might_sleep();
-        if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
-                return;
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count)) {
+        BUG_ON(!dentry->d_count);
+        if (dentry->d_count > 1) {
+                dentry->d_count--;
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
                return;
        }
-        /*
+        if (dentry->d_flags & DCACHE_OP_DELETE) {
-         * AV: ->d_delete() is _NOT_ allowed to block now.
-         */
-        if (dentry->d_op && dentry->d_op->d_delete) {
                if (dentry->d_op->d_delete(dentry))
-                        goto unhash_it;
+                        goto kill_it;
        }
        /* Unreachable? Get rid of it */
@@ -252,16 +451,12 @@ repeat:
        dentry->d_flags |= DCACHE_REFERENCED;
        dentry_lru_add(dentry);
-        spin_unlock(&dentry->d_lock);
+        dentry->d_count--;
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return;
-unhash_it:
-        __d_drop(dentry);
 kill_it:
-        /* if dentry was on the d_lru list delete it from there */
+        dentry = dentry_kill(dentry, 1);
-        dentry_lru_del(dentry);
-        dentry = d_kill(dentry);
        if (dentry)
                goto repeat;
 }
@@ -284,9 +479,9 @@ int d_invalidate(struct dentry * dentry)
        /*
         * If it's already been dropped, return OK.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
                return 0;
        }
        /*
@@ -294,9 +489,9 @@ int d_invalidate(struct dentry * dentry)
         * to get rid of unused child entries.
         */
        if (!list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
                shrink_dcache_parent(dentry);
-                spin_lock(&dcache_lock);
+                spin_lock(&dentry->d_lock);
        }
        /*
@@ -309,35 +504,61 @@ int d_invalidate(struct dentry * dentry)
         * we might still populate it if it was a
         * working directory or similar).
         */
-        spin_lock(&dentry->d_lock);
+        if (dentry->d_count > 1) {
-        if (atomic_read(&dentry->d_count) > 1) {
                if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
                        return -EBUSY;
                }
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        return 0;
 }
 EXPORT_SYMBOL(d_invalidate);
-/* This should be called _only_ with dcache_lock held */
+/* Take a reference on @dentry. This must be called with d_lock held */
-static inline struct dentry * __dget_locked(struct dentry *dentry)
+static inline void __dget_dlock(struct dentry *dentry)
 {
-        atomic_inc(&dentry->d_count);
+        dentry->d_count++;
-        dentry_lru_del(dentry);
-        return dentry;
 }
-struct dentry * dget_locked(struct dentry *dentry)
+/* Take a reference on @dentry, acquiring and releasing d_lock around the increment. */
+static inline void __dget(struct dentry *dentry)
 {
-        return __dget_locked(dentry);
+        spin_lock(&dentry->d_lock);
+        __dget_dlock(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/*
+ * Take a reference on @dentry's parent and return it.
+ * Returns NULL, without taking any reference, if d_parent is NULL.
+ */
+struct dentry *dget_parent(struct dentry *dentry)
+{
+        struct dentry *ret;
+repeat:
+        /*
+         * Don't need rcu_dereference because we re-check it was correct under
+         * the lock.
+         */
+        rcu_read_lock();
+        ret = dentry->d_parent;
+        if (!ret) {
+                rcu_read_unlock();
+                goto out;
+        }
+        spin_lock(&ret->d_lock);
+        /* d_parent changed before we got the lock: drop it and start over. */
+        if (unlikely(ret != dentry->d_parent)) {
+                spin_unlock(&ret->d_lock);
+                rcu_read_unlock();
+                goto repeat;
+        }
+        rcu_read_unlock();
+        /* The parent is expected to already hold at least one reference. */
+        BUG_ON(!ret->d_count);
+        ret->d_count++;
+        spin_unlock(&ret->d_lock);
+out:
+        return ret;
 }
-EXPORT_SYMBOL(dget_locked);
+EXPORT_SYMBOL(dget_parent);
 /**
 * d_find_alias - grab a hashed alias of inode
@@ -355,42 +576,51 @@ EXPORT_SYMBOL(dget_locked);
 * any other hashed alias over that one unless @want_discon is set,
 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
 */
+static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
-static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
 {
-        struct list_head *head, *next, *tmp;
+        struct dentry *alias, *discon_alias;
-        struct dentry *alias, *discon_alias=NULL;
-        head = &inode->i_dentry;
+again:
-        next = inode->i_dentry.next;
+        discon_alias = NULL;
-        while (next != head) {
+        /* Each alias is examined, and referenced if chosen, under its own d_lock. */
+        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-                tmp = next;
+                spin_lock(&alias->d_lock);
-                next = tmp->next;
-                prefetch(next);
-                alias = list_entry(tmp, struct dentry, d_alias);
                 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
                         if (IS_ROOT(alias) &&
-                            (alias->d_flags & DCACHE_DISCONNECTED))
+                            (alias->d_flags & DCACHE_DISCONNECTED)) {
                                 discon_alias = alias;
-                        else if (!want_discon) {
+                        } else if (!want_discon) {
-                                __dget_locked(alias);
+                                __dget_dlock(alias);
+                                spin_unlock(&alias->d_lock);
+                                return alias;
+                        }
+                }
+                spin_unlock(&alias->d_lock);
+        }
+        /*
+         * The remembered disconnected alias was unlocked in the meantime:
+         * re-check it under d_lock, and rescan the list if it no longer
+         * qualifies.
+         */
+        if (discon_alias) {
+                alias = discon_alias;
+                spin_lock(&alias->d_lock);
+                if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+                        if (IS_ROOT(alias) &&
+                            (alias->d_flags & DCACHE_DISCONNECTED)) {
+                                __dget_dlock(alias);
+                                spin_unlock(&alias->d_lock);
                                 return alias;
                         }
                 }
+                spin_unlock(&alias->d_lock);
+                goto again;
         }
-        if (discon_alias)
+        return NULL;
-                __dget_locked(discon_alias);
-        return discon_alias;
 }
-struct dentry * d_find_alias(struct inode *inode)
+struct dentry *d_find_alias(struct inode *inode)
 {
         struct dentry *de = NULL;
         if (!list_empty(&inode->i_dentry)) {
-                spin_lock(&dcache_lock);
+                /* The alias list is walked under inode->i_lock. */
+                spin_lock(&inode->i_lock);
                 de = __d_find_alias(inode, 0);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
         }
         return de;
 }
@@ -404,54 +634,61 @@ void d_prune_aliases(struct inode *inode)
 {
        struct dentry *dentry;
 restart:
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                spin_lock(&dentry->d_lock);
-                if (!atomic_read(&dentry->d_count)) {
+                if (!dentry->d_count) {
-                        __dget_locked(dentry);
+                        __dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        dput(dentry);
                        goto restart;
                }
                spin_unlock(&dentry->d_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_prune_aliases);
 /*
- * Throw away a dentry - free the inode, dput the parent.  This requires that
+ * Try to throw away a dentry - free the inode, dput the parent.
- * the LRU list has already been removed.
+ * Requires dentry->d_lock is held, and dentry->d_count == 0.
+ * Releases dentry->d_lock.
  *
- * Try to prune ancestors as well.  This is necessary to prevent
+ * This may fail if locks cannot be acquired. No problem, just try again.
- * quadratic behavior of shrink_dcache_parent(), but is also expected
- * to be beneficial in reducing dentry cache fragmentation.
  */
-static void prune_one_dentry(struct dentry * dentry)
+static void try_prune_one_dentry(struct dentry *dentry)
         __releases(dentry->d_lock)
-        __releases(dcache_lock)
-        __acquires(dcache_lock)
 {
-        __d_drop(dentry);
+        struct dentry *parent;
-        dentry = d_kill(dentry);
+        parent = dentry_kill(dentry, 0);
         /*
-         * Prune ancestors.  Locking is simpler than in dput(),
+         * If dentry_kill returns NULL, we have nothing more to do.
-         * because dcache_lock needs to be taken anyway.
+         * If it returns the same dentry, trylocks failed. In either
+         * case, just return and let the caller loop again.
+         *
+         * Otherwise, we need to prune ancestors too. This is necessary
+         * to prevent quadratic behavior of shrink_dcache_parent(), but
+         * is also expected to be beneficial in reducing dentry cache
+         * fragmentation.
          */
-        spin_lock(&dcache_lock);
+        if (!parent)
+                return;
+        if (parent == dentry)
+                return;
+        /* Prune ancestors. */
+        dentry = parent;
         while (dentry) {
-                if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+                spin_lock(&dentry->d_lock);
+                /* Someone else still uses this ancestor: drop our reference and stop. */
+                if (dentry->d_count > 1) {
+                        dentry->d_count--;
+                        spin_unlock(&dentry->d_lock);
                         return;
+                }
-                if (dentry->d_op && dentry->d_op->d_delete)
+                dentry = dentry_kill(dentry, 1);
-                        dentry->d_op->d_delete(dentry);
-                dentry_lru_del(dentry);
-                __d_drop(dentry);
-                dentry = d_kill(dentry);
-                spin_lock(&dcache_lock);
         }
 }
@@ -459,24 +696,35 @@ static void shrink_dentry_list(struct list_head *list)
 {
        struct dentry *dentry;
-        while (!list_empty(list)) {
+        rcu_read_lock();
-                dentry = list_entry(list->prev, struct dentry, d_lru);
+        for (;;) {
-                dentry_lru_del(dentry);
+                dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
+                if (&dentry->d_lru == list)
+                        break; /* empty */
+                spin_lock(&dentry->d_lock);
+                if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
+                        spin_unlock(&dentry->d_lock);
+                        continue;
+                }
                /*
                 * We found an inuse dentry which was not removed from
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
-                spin_lock(&dentry->d_lock);
+                if (dentry->d_count) {
-                if (atomic_read(&dentry->d_count)) {
+                        dentry_lru_del(dentry);
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
-                prune_one_dentry(dentry);
-                /* dentry->d_lock was dropped in prune_one_dentry() */
+                rcu_read_unlock();
-                cond_resched_lock(&dcache_lock);
+                try_prune_one_dentry(dentry);
+                rcu_read_lock();
        }
+        rcu_read_unlock();
 }
 /**
@@ -495,42 +743,44 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
        LIST_HEAD(tmp);
        int cnt = *count;
-        spin_lock(&dcache_lock);
+relock:
+        spin_lock(&dcache_lru_lock);
        while (!list_empty(&sb->s_dentry_lru)) {
                dentry = list_entry(sb->s_dentry_lru.prev,
                                struct dentry, d_lru);
                BUG_ON(dentry->d_sb != sb);
+                if (!spin_trylock(&dentry->d_lock)) {
+                        spin_unlock(&dcache_lru_lock);
+                        cpu_relax();
+                        goto relock;
+                }
                /*
                 * If we are honouring the DCACHE_REFERENCED flag and the
                 * dentry has this flag set, don't free it.  Clear the flag
                 * and put it back on the LRU.
                 */
-                if (flags & DCACHE_REFERENCED) {
+                if (flags & DCACHE_REFERENCED &&
-                        spin_lock(&dentry->d_lock);
+                                dentry->d_flags & DCACHE_REFERENCED) {
-                        if (dentry->d_flags & DCACHE_REFERENCED) {
+                        dentry->d_flags &= ~DCACHE_REFERENCED;
-                                dentry->d_flags &= ~DCACHE_REFERENCED;
+                        list_move(&dentry->d_lru, &referenced);
-                                list_move(&dentry->d_lru, &referenced);
-                                spin_unlock(&dentry->d_lock);
-                                cond_resched_lock(&dcache_lock);
-                                continue;
-                        }
                        spin_unlock(&dentry->d_lock);
+                } else {
+                        list_move_tail(&dentry->d_lru, &tmp);
+                        spin_unlock(&dentry->d_lock);
+                        if (!--cnt)
+                                break;
                }
+                cond_resched_lock(&dcache_lru_lock);
-                list_move_tail(&dentry->d_lru, &tmp);
-                if (!--cnt)
-                        break;
-                cond_resched_lock(&dcache_lock);
        }
-        *count = cnt;
-        shrink_dentry_list(&tmp);
        if (!list_empty(&referenced))
                list_splice(&referenced, &sb->s_dentry_lru);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dcache_lru_lock);
+        shrink_dentry_list(&tmp);
+        *count = cnt;
 }
 /**
@@ -546,13 +796,12 @@ static void prune_dcache(int count)
 {
        struct super_block *sb, *p = NULL;
        int w_count;
-        int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        int unused = dentry_stat.nr_unused;
        int prune_ratio;
        int pruned;
        if (unused == 0 || count == 0)
                return;
-        spin_lock(&dcache_lock);
        if (count >= unused)
                prune_ratio = 1;
        else
@@ -589,11 +838,9 @@ static void prune_dcache(int count)
                if (down_read_trylock(&sb->s_umount)) {
                        if ((sb->s_root != NULL) &&
                            (!list_empty(&sb->s_dentry_lru))) {
-                                spin_unlock(&dcache_lock);
                                __shrink_dcache_sb(sb, &w_count,
                                                DCACHE_REFERENCED);
                                pruned -= w_count;
-                                spin_lock(&dcache_lock);
                        }
                        up_read(&sb->s_umount);
                }
@@ -609,7 +856,6 @@ static void prune_dcache(int count)
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
-        spin_unlock(&dcache_lock);
 }
 /**
@@ -623,12 +869,14 @@ void shrink_dcache_sb(struct super_block *sb)
 {
        LIST_HEAD(tmp);
-        spin_lock(&dcache_lock);
+        spin_lock(&dcache_lru_lock);
        while (!list_empty(&sb->s_dentry_lru)) {
                list_splice_init(&sb->s_dentry_lru, &tmp);
+                spin_unlock(&dcache_lru_lock);
                shrink_dentry_list(&tmp);
+                spin_lock(&dcache_lru_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dcache_lru_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -645,10 +893,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
        BUG_ON(!IS_ROOT(dentry));
        /* detach this root from the system */
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        dentry_lru_del(dentry);
        __d_drop(dentry);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        for (;;) {
                /* descend to the first leaf in the current subtree */
@@ -657,14 +905,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                        /* this is a branch with children - detach all of them
                         * from the system in one go */
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
                        list_for_each_entry(loop, &dentry->d_subdirs,
                                            d_u.d_child) {
+                                spin_lock_nested(&loop->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
                                dentry_lru_del(loop);
                                __d_drop(loop);
-                                cond_resched_lock(&dcache_lock);
+                                spin_unlock(&loop->d_lock);
                        }
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
                        /* move to the first child */
                        dentry = list_entry(dentry->d_subdirs.next,
@@ -676,7 +926,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                do {
                        struct inode *inode;
-                        if (atomic_read(&dentry->d_count) != 0) {
+                        if (dentry->d_count != 0) {
                                printk(KERN_ERR
                                       "BUG: Dentry %p{i=%lx,n=%s}"
                                       " still in use (%d)"
@@ -685,20 +935,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                                       dentry->d_inode ?
                                       dentry->d_inode->i_ino : 0UL,
                                       dentry->d_name.name,
-                                       atomic_read(&dentry->d_count),
+                                       dentry->d_count,
                                       dentry->d_sb->s_type->name,
                                       dentry->d_sb->s_id);
                                BUG();
                        }
-                        if (IS_ROOT(dentry))
+                        if (IS_ROOT(dentry)) {
                                parent = NULL;
-                        else {
+                                list_del(&dentry->d_u.d_child);
+                        } else {
                                parent = dentry->d_parent;
-                                atomic_dec(&parent->d_count);
+                                spin_lock(&parent->d_lock);
+                                parent->d_count--;
+                                list_del(&dentry->d_u.d_child);
+                                spin_unlock(&parent->d_lock);
                        }
-                        list_del(&dentry->d_u.d_child);
                        detached++;
                        inode = dentry->d_inode;
@@ -728,8 +981,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 /*
 * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock, and only need dcache_lock when
+ * - we don't need to use dentry->d_lock because:
- *   removing the dentry from the system lists and hashes because:
 *   - the superblock is detached from all mountings and open files, so the
 *     dentry trees will not be rearranged by the VFS
 *   - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -746,11 +998,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
        dentry = sb->s_root;
        sb->s_root = NULL;
-        atomic_dec(&dentry->d_count);
+        spin_lock(&dentry->d_lock);
+        dentry->d_count--;
+        spin_unlock(&dentry->d_lock);
        shrink_dcache_for_umount_subtree(dentry);
-        while (!hlist_empty(&sb->s_anon)) {
+        while (!hlist_bl_empty(&sb->s_anon)) {
-                dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
+                dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
                shrink_dcache_for_umount_subtree(dentry);
        }
 }
@@ -768,15 +1022,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
 * Return true if the parent or its subdirectories contain
 * a mount point
 */
- 
 int have_submounts(struct dentry *parent)
 {
-        struct dentry *this_parent = parent;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
+        int locked = 0;
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = parent;
-        spin_lock(&dcache_lock);
        if (d_mountpoint(parent))
                goto positive;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -784,27 +1043,65 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                /* Have we found a mount point ? */
-                if (d_mountpoint(dentry))
+                if (d_mountpoint(dentry)) {
+                        spin_unlock(&dentry->d_lock);
+                        spin_unlock(&this_parent->d_lock);
                        goto positive;
+                }
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
+                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                this_parent = this_parent->d_parent;
+                struct dentry *child;
+                tmp = this_parent->d_parent;
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                         (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return 0; /* No mount points found in tree */
 positive:
-        spin_unlock(&dcache_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return 1;
+rename_retry:
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 EXPORT_SYMBOL(have_submounts);
@@ -824,11 +1121,16 @@ EXPORT_SYMBOL(have_submounts);
 */
 static int select_parent(struct dentry * parent)
 {
-        struct dentry *this_parent = parent;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
        int found = 0;
+        int locked = 0;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = parent;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -837,11 +1139,13 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                /* 
                 * move only zero ref count dentries to the end 
                 * of the unused list for prune_dcache
                 */
-                if (!atomic_read(&dentry->d_count)) {
+                if (!dentry->d_count) {
                        dentry_lru_move_tail(dentry);
                        found++;
                } else {
@@ -853,28 +1157,63 @@ resume:
                 * ensures forward progress). We'll be coming back to find
                 * the rest.
                 */
-                if (found && need_resched())
+                if (found && need_resched()) {
+                        spin_unlock(&dentry->d_lock);
                        goto out;
+                }
                /*
                 * Descend a level if the d_subdirs list is non-empty.
                 */
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
+                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                this_parent = this_parent->d_parent;
+                struct dentry *child;
+                tmp = this_parent->d_parent;
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                        (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
 out:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return found;
+rename_retry:
+        if (found)
+                return found;
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 /**
@@ -908,16 +1247,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
-        int nr_unused;
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                prune_dcache(nr);
        }
-        nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
-        return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dcache_shrinker = {
@@ -960,38 +1296,52 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;
-        atomic_set(&dentry->d_count, 1);
+        dentry->d_count = 1;
        dentry->d_flags = DCACHE_UNHASHED;
        spin_lock_init(&dentry->d_lock);
+        seqcount_init(&dentry->d_seq);
        dentry->d_inode = NULL;
        dentry->d_parent = NULL;
        dentry->d_sb = NULL;
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
-        dentry->d_mounted = 0;
+        INIT_HLIST_BL_NODE(&dentry->d_hash);
-        INIT_HLIST_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
        INIT_LIST_HEAD(&dentry->d_alias);
+        INIT_LIST_HEAD(&dentry->d_u.d_child);
        if (parent) {
-                dentry->d_parent = dget(parent);
+                spin_lock(&parent->d_lock);
+                /*
+                 * don't need child lock because it is not subject
+                 * to concurrency here
+                 */
+                __dget_dlock(parent);
+                dentry->d_parent = parent;
                dentry->d_sb = parent->d_sb;
-        } else {
-                INIT_LIST_HEAD(&dentry->d_u.d_child);
-        }
-        spin_lock(&dcache_lock);
-        if (parent)
                list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-        spin_unlock(&dcache_lock);
+                spin_unlock(&parent->d_lock);
+        }
-        percpu_counter_inc(&nr_dentry);
+        this_cpu_inc(nr_dentry);
        return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
+struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
+{
+        struct dentry *dentry = d_alloc(NULL, name);
+        if (dentry) {
+                dentry->d_sb = sb;
+                dentry->d_parent = dentry;
+                dentry->d_flags |= DCACHE_DISCONNECTED;
+        }
+        return dentry;
+}
+EXPORT_SYMBOL(d_alloc_pseudo);
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
        struct qstr q;
@@ -1003,12 +1353,36 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 }
 EXPORT_SYMBOL(d_alloc_name);
-/* the caller must hold dcache_lock */
+void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+        BUG_ON(dentry->d_op);
+        BUG_ON(dentry->d_flags & (DCACHE_OP_HASH        |
+                                DCACHE_OP_COMPARE       |
+                                DCACHE_OP_REVALIDATE    |
+                                DCACHE_OP_DELETE ));
+        dentry->d_op = op;
+        if (!op)
+                return;
+        if (op->d_hash)
+                dentry->d_flags |= DCACHE_OP_HASH;
+        if (op->d_compare)
+                dentry->d_flags |= DCACHE_OP_COMPARE;
+        if (op->d_revalidate)
+                dentry->d_flags |= DCACHE_OP_REVALIDATE;
+        if (op->d_delete)
+                dentry->d_flags |= DCACHE_OP_DELETE;
+}
+EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
+        spin_lock(&dentry->d_lock);
        if (inode)
                list_add(&dentry->d_alias, &inode->i_dentry);
        dentry->d_inode = inode;
+        dentry_rcuwalk_barrier(dentry);
+        spin_unlock(&dentry->d_lock);
        fsnotify_d_instantiate(dentry, inode);
 }
@@ -1030,9 +1404,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 void d_instantiate(struct dentry *entry, struct inode * inode)
 {
        BUG_ON(!list_empty(&entry->d_alias));
-        spin_lock(&dcache_lock);
+        if (inode)
+                spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
-        spin_unlock(&dcache_lock);
+        if (inode)
+                spin_unlock(&inode->i_lock);
        security_d_instantiate(entry, inode);
 }
 EXPORT_SYMBOL(d_instantiate);
@@ -1069,15 +1445,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
                struct qstr *qstr = &alias->d_name;
+                /*
+                 * Don't need alias->d_lock here, because aliases with
+                 * d_parent == entry->d_parent are not subject to name or
+                 * parent changes, because the parent inode i_mutex is held.
+                 */
                if (qstr->hash != hash)
                        continue;
                if (alias->d_parent != entry->d_parent)
                        continue;
-                if (qstr->len != len)
+                if (dentry_cmp(qstr->name, qstr->len, name, len))
                        continue;
-                if (memcmp(qstr->name, name, len))
+                __dget(alias);
-                        continue;
-                dget_locked(alias);
                return alias;
        }
@@ -1091,9 +1470,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
        BUG_ON(!list_empty(&entry->d_alias));
-        spin_lock(&dcache_lock);
+        if (inode)
+                spin_lock(&inode->i_lock);
        result = __d_instantiate_unique(entry, inode);
-        spin_unlock(&dcache_lock);
+        if (inode)
+                spin_unlock(&inode->i_lock);
        if (!result) {
                security_d_instantiate(entry, inode);
@@ -1134,14 +1515,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
-static inline struct hlist_head *d_hash(struct dentry *parent,
-                                        unsigned long hash)
-{
-        hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
-        hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
-        return dentry_hashtable + (hash & D_HASHMASK);
-}
 /**
 * d_obtain_alias - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
@@ -1182,10 +1555,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
        }
        tmp->d_parent = tmp; /* make sure dput doesn't croak */
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        res = __d_find_alias(inode, 0);
        if (res) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                dput(tmp);
                goto out_iput;
        }
@@ -1195,12 +1569,14 @@ struct dentry *d_obtain_alias(struct inode *inode)
        tmp->d_sb = inode->i_sb;
        tmp->d_inode = inode;
        tmp->d_flags |= DCACHE_DISCONNECTED;
-        tmp->d_flags &= ~DCACHE_UNHASHED;
        list_add(&tmp->d_alias, &inode->i_dentry);
-        hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+        bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
+        tmp->d_flags &= ~DCACHE_UNHASHED;
+        hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
+        __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
        spin_unlock(&tmp->d_lock);
+        spin_unlock(&inode->i_lock);
-        spin_unlock(&dcache_lock);
        return tmp;
 out_iput:
@@ -1230,18 +1606,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
        struct dentry *new = NULL;
        if (inode && S_ISDIR(inode->i_mode)) {
-                spin_lock(&dcache_lock);
+                spin_lock(&inode->i_lock);
                new = __d_find_alias(inode, 1);
                if (new) {
                        BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        security_d_instantiate(new, inode);
                        d_move(new, dentry);
                        iput(inode);
                } else {
-                        /* already taking dcache_lock, so d_add() by hand */
+                        /* already taking inode->i_lock, so d_add() by hand */
                        __d_instantiate(dentry, inode);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        security_d_instantiate(dentry, inode);
                        d_rehash(dentry);
                }
@@ -1314,10 +1690,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
         * Negative dentry: instantiate it unless the inode is a directory and
         * already has a dentry.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
                __d_instantiate(found, inode);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                security_d_instantiate(found, inode);
                return found;
        }
@@ -1327,8 +1703,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
         * reference to it, move it in place and use it.
         */
        new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-        dget_locked(new);
+        __dget(new);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        security_d_instantiate(found, inode);
        d_move(new, found);
        iput(inode);
@@ -1342,6 +1718,112 @@ err_out:
 EXPORT_SYMBOL(d_add_ci);
 /**
+ * __d_lookup_rcu - search for a dentry (racy, store-free)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * @seq: returns d_seq value at the point where the dentry was found
+ * @inode: returns dentry->d_inode when the inode was found valid.
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup_rcu is the dcache lookup function for rcu-walk name
+ * resolution (store-free path walking) design described in
+ * Documentation/filesystems/path-lookup.txt.
+ *
+ * This is not to be used outside core vfs.
+ *
+ * __d_lookup_rcu must only be used in rcu-walk mode, i.e. with vfsmount lock
+ * held, and rcu_read_lock held. The returned dentry must not be stored into
+ * without taking d_lock and checking d_seq sequence count against @seq
+ * returned here.
+ *
+ * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * function.
+ *
+ * Alternatively, __d_lookup_rcu may be called again to look up the child of
+ * the returned dentry, so long as its parent's seqlock is checked after the
+ * child is looked up. Thus, an interlocking stepping of sequence lock checks
+ * is formed, giving integrity down the path walk.
+ */
+struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
+                                unsigned *seq, struct inode **inode)
+{
+        unsigned int len = name->len;
+        unsigned int hash = name->hash;
+        const unsigned char *str = name->name;
+        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_node *node;
+        struct dentry *dentry;
+        /*
+         * Note: There is significant duplication with __d_lookup which is
+         * required to prevent single threaded performance regressions
+         * especially on architectures where smp_rmb (in seqcounts) are costly.
+         * Keep the two functions in sync.
+         */
+        /*
+         * The hash list is protected using RCU.
+         *
+         * Carefully use d_seq when comparing a candidate dentry, to avoid
+         * races with d_move().
+         *
+         * It is possible that concurrent renames can mess up our list
+         * walk here and result in missing our dentry, resulting in the
+         * false-negative result. d_lookup() protects against concurrent
+         * renames using rename_lock seqlock.
+         *
+         * See Documentation/vfs/dcache-locking.txt for more details.
+         */
+        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
+                struct inode *i;
+                const char *tname;
+                int tlen;
+                if (dentry->d_name.hash != hash)
+                        continue;
+seqretry:
+                *seq = read_seqcount_begin(&dentry->d_seq);
+                if (dentry->d_parent != parent)
+                        continue;
+                if (d_unhashed(dentry))
+                        continue;
+                tlen = dentry->d_name.len;
+                tname = dentry->d_name.name;
+                i = dentry->d_inode;
+                prefetch(tname);
+                if (i)
+                        prefetch(i);
+                /*
+                 * This seqcount check is required to ensure name and
+                 * len are loaded atomically, so as not to walk off the
+                 * edge of memory when walking. If we could load this
+                 * atomically some other way, we could drop this check.
+                 */
+                if (read_seqcount_retry(&dentry->d_seq, *seq))
+                        goto seqretry;
+                if (parent->d_flags & DCACHE_OP_COMPARE) {
+                        if (parent->d_op->d_compare(parent, *inode,
+                                                dentry, i,
+                                                tlen, tname, name))
+                                continue;
+                } else {
+                        if (dentry_cmp(tname, tlen, str, len))
+                                continue;
+                }
+                /*
+                 * No extra seqcount check is required after the name
+                 * compare. The caller must perform a seqcount check in
+                 * order to do anything useful with the returned dentry
+                 * anyway.
+                 */
+                *inode = i;
+                return dentry;
+        }
+        return NULL;
+}
+/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
@@ -1352,10 +1834,10 @@ EXPORT_SYMBOL(d_add_ci);
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
-struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
 {
-        struct dentry * dentry = NULL;
+        struct dentry *dentry;
-        unsigned long seq;
+        unsigned seq;
        do {
                seq = read_seqbegin(&rename_lock);
@@ -1367,7 +1849,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
-/*
+/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
@@ -1382,17 +1864,24 @@ EXPORT_SYMBOL(d_lookup);
 *
 * __d_lookup callers must be commented.
 */
-struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 {
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
-        struct hlist_head *head = d_hash(parent,hash);
+        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_node *node;
        struct dentry *found = NULL;
-        struct hlist_node *node;
        struct dentry *dentry;
        /*
+         * Note: There is significant duplication with __d_lookup_rcu which is
+         * required to prevent single threaded performance regressions
+         * especially on architectures where smp_rmb (in seqcounts) are costly.
+         * Keep the two functions in sync.
+         */
+        /*
         * The hash list is protected using RCU.
         *
         * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1407,25 +1896,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
         */
        rcu_read_lock();
        
-        hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
-                struct qstr *qstr;
+                const char *tname;
+                int tlen;
                if (dentry->d_name.hash != hash)
                        continue;
-                if (dentry->d_parent != parent)
-                        continue;
                spin_lock(&dentry->d_lock);
-                /*
-                 * Recheck the dentry after taking the lock - d_move may have
-                 * changed things. Don't bother checking the hash because
-                 * we're about to compare the whole name anyway.
-                 */
                if (dentry->d_parent != parent)
                        goto next;
-                /* non-existing due to RCU? */
                if (d_unhashed(dentry))
                        goto next;
@@ -1433,18 +1913,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                 * It is safe to compare names since d_move() cannot
                 * change the qstr (protected by d_lock).
                 */
-                qstr = &dentry->d_name;
+                tlen = dentry->d_name.len;
-                if (parent->d_op && parent->d_op->d_compare) {
+                tname = dentry->d_name.name;
-                        if (parent->d_op->d_compare(parent, qstr, name))
+                if (parent->d_flags & DCACHE_OP_COMPARE) {
+                        if (parent->d_op->d_compare(parent, parent->d_inode,
+                                                dentry, dentry->d_inode,
+                                                tlen, tname, name))
                                goto next;
                } else {
-                        if (qstr->len != len)
+                        if (dentry_cmp(tname, tlen, str, len))
-                                goto next;
-                        if (memcmp(qstr->name, str, len))
                                goto next;
                }
-                atomic_inc(&dentry->d_count);
+                dentry->d_count++;
                found = dentry;
                spin_unlock(&dentry->d_lock);
                break;
@@ -1473,8 +1954,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
         * routine may choose to leave the hash value unchanged.
         */
        name->hash = full_name_hash(name->name, name->len);
-        if (dir->d_op && dir->d_op->d_hash) {
+        if (dir->d_flags & DCACHE_OP_HASH) {
-                if (dir->d_op->d_hash(dir, name) < 0)
+                if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
                        goto out;
        }
        dentry = d_lookup(dir, name);
@@ -1483,34 +1964,32 @@ out:
 }
 /**
- * d_validate - verify dentry provided from insecure source
+ * d_validate - verify dentry provided from insecure source (deprecated)
 * @dentry: The dentry alleged to be valid child of @dparent
 * @dparent: The parent dentry (known to be valid)
 *
 * An insecure source has sent us a dentry, here we verify it and dget() it.
 * This is used by ncpfs in its readdir implementation.
 * Zero is returned in the dentry is invalid.
+ *
+ * This function is slow for big directories, and deprecated, do not use it.
+ *
+ * The check is a linear walk of @dparent->d_subdirs under @dparent->d_lock,
+ * comparing each child against @dentry by pointer, so @dentry itself is not
+ * dereferenced until it has been found on that list. On a match a reference
+ * is taken with __dget_dlock() under @dentry->d_lock and 1 is returned.
 */
-int d_validate(struct dentry *dentry, struct dentry *parent)
+int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-        struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
+        struct dentry *child;
-        struct hlist_node *node;
-        struct dentry *d;
-        /* Check whether the ptr might be valid at all.. */
-        if (!kmem_ptr_validate(dentry_cache, dentry))
-                return 0;
-        if (dentry->d_parent != parent)
-                return 0;
-        rcu_read_lock();
+        spin_lock(&dparent->d_lock);        /* held across the whole d_subdirs walk */
-        hlist_for_each_entry_rcu(d, node, head, d_hash) {
+        list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
-                if (d == dentry) {
+                if (dentry == child) {        /* pointer identity only */
-                        dget(dentry);
+                        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+                        __dget_dlock(dentry);        /* reference for the caller */
+                        spin_unlock(&dentry->d_lock);
+                        spin_unlock(&dparent->d_lock);
                        return 1;
                }
        }
-        rcu_read_unlock();
+        spin_unlock(&dparent->d_lock);        /* not found among dparent's children */
        return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1538,16 +2017,23 @@ EXPORT_SYMBOL(d_validate);
 
 void d_delete(struct dentry * dentry)
 {
+        struct inode *inode;
        int isdir = 0;
        /*
         * Are we the only user?
         */
-        spin_lock(&dcache_lock);
+again:
        spin_lock(&dentry->d_lock);
-        isdir = S_ISDIR(dentry->d_inode->i_mode);
+        inode = dentry->d_inode;
-        if (atomic_read(&dentry->d_count) == 1) {
+        isdir = S_ISDIR(inode->i_mode);
+        if (dentry->d_count == 1) {
+                if (inode && !spin_trylock(&inode->i_lock)) {
+                        spin_unlock(&dentry->d_lock);
+                        cpu_relax();
+                        goto again;
+                }
                dentry->d_flags &= ~DCACHE_CANT_MOUNT;
-                dentry_iput(dentry);
+                dentry_unlink_inode(dentry);
                fsnotify_nameremove(dentry, isdir);
                return;
        }
@@ -1556,17 +2042,18 @@ void d_delete(struct dentry * dentry)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        fsnotify_nameremove(dentry, isdir);
 }
 EXPORT_SYMBOL(d_delete);
-static void __d_rehash(struct dentry * entry, struct hlist_head *list)
+static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
 {
+        BUG_ON(!d_unhashed(entry));        /* entry must not already be hashed */
+        spin_lock_bucket(b);                /* held over the flag clear and the list insert */
        entry->d_flags &= ~DCACHE_UNHASHED;
-        hlist_add_head_rcu(&entry->d_hash, list);
+        hlist_bl_add_head_rcu(&entry->d_hash, &b->head);        /* insert at head of bucket b */
+        spin_unlock_bucket(b);
 }
 static void _d_rehash(struct dentry * entry)
@@ -1583,25 +2070,39 @@ static void _d_rehash(struct dentry * entry)
 
 void d_rehash(struct dentry * entry)
 {
-        spin_lock(&dcache_lock);
        spin_lock(&entry->d_lock);
        _d_rehash(entry);
        spin_unlock(&entry->d_lock);
-        spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_rehash);
-/*
+/**
- * When switching names, the actual string doesn't strictly have to
+ * dentry_update_name_case - update case insensitive dentry with a new name
- * be preserved in the target - because we're dropping the target
+ * @dentry: dentry to be updated
- * anyway. As such, we can just do a simple memcpy() to copy over
+ * @name: new name
- * the new name before we switch.
 *
- * Note that we have to be a lot more careful about getting the hash
+ * Update a case insensitive dentry with new case of name.
- * switched - we have to switch the hash value properly even if it
+ *
- * then no longer matches the actual (corrupted) string of the target.
+ * dentry must have been returned by d_lookup with name @name. Old and new
- * The hash value has to match the hash queue that the dentry is on..
+ * name lengths must match (ie. no d_compare which allows mismatched name
+ * lengths).
+ *
+ * Parent inode i_mutex must be held over d_lookup and into this call (to
+ * keep renames and concurrent inserts, and readdir(2) away).
 */
+void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
+{
+        /*
+         * The contract is that the *parent* directory's i_mutex is held
+         * across d_lookup and this call, so that is the lock to assert on.
+         * The dentry's own inode is not part of that contract, and a
+         * negative dentry has no d_inode to test at all.
+         */
+        BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+        BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
+        spin_lock(&dentry->d_lock);
+        /* d_seq write section brackets the in-place overwrite of the name */
+        write_seqcount_begin(&dentry->d_seq);
+        memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
+        write_seqcount_end(&dentry->d_seq);
+        spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(dentry_update_name_case);
 static void switch_names(struct dentry *dentry, struct dentry *target)
 {
        if (dname_external(target)) {
@@ -1643,54 +2144,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
        swap(dentry->d_name.len, target->d_name.len);
 }
+static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
+{
+        /*
+         * Take the d_locks used while moving @dentry onto @target: first the
+         * parent lock(s), then the d_lock of @dentry and @target themselves.
+         *
+         * Parents: if @dentry is a root or both share one parent, only
+         * target's parent is locked. Otherwise both parents are locked, and
+         * the d_ancestor() test picks which goes first (dentry's parent when
+         * the test succeeds, target's parent when it does not); the second
+         * one is taken with DENTRY_D_LOCK_NESTED.
+         * NOTE(review): presumably d_ancestor(p1, p2) is non-NULL when p1 is
+         * an ancestor of p2 - confirm against its definition.
+         *
+         * The two dentries are then locked lower address first, with lockdep
+         * subclasses 2 and 3.
+         *
+         * XXXX: do we really need to take target->d_lock?
+         */
+        if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
+                spin_lock(&target->d_parent->d_lock);        /* single parent lock */
+        else {
+                if (d_ancestor(dentry->d_parent, target->d_parent)) {
+                        spin_lock(&dentry->d_parent->d_lock);
+                        spin_lock_nested(&target->d_parent->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
+                } else {
+                        spin_lock(&target->d_parent->d_lock);
+                        spin_lock_nested(&dentry->d_parent->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
+                }
+        }
+        if (target < dentry) {        /* address order between the two dentries */
+                spin_lock_nested(&target->d_lock, 2);
+                spin_lock_nested(&dentry->d_lock, 3);
+        } else {
+                spin_lock_nested(&dentry->d_lock, 2);
+                spin_lock_nested(&target->d_lock, 3);
+        }
+}
+static void dentry_unlock_parents_for_move(struct dentry *dentry,
+                                        struct dentry *target)
+{        /* Drops d_parent->d_lock of both dentries, going by d_parent as it is at call time. */
+        if (target->d_parent != dentry->d_parent)        /* parents differ: dentry's is a separate lock */
+                spin_unlock(&dentry->d_parent->d_lock);
+        if (target->d_parent != target)                /* skipped when target is its own parent */
+                spin_unlock(&target->d_parent->d_lock);
+}
 /*
- * We cannibalize "target" when moving dentry on top of it,
+ * When switching names, the actual string doesn't strictly have to
- * because it's going to be thrown away anyway. We could be more
+ * be preserved in the target - because we're dropping the target
- * polite about it, though.
+ * anyway. As such, we can just do a simple memcpy() to copy over
- *
+ * the new name before we switch.
- * This forceful removal will result in ugly /proc output if
+ *
- * somebody holds a file open that got deleted due to a rename.
+ * Note that we have to be a lot more careful about getting the hash
- * We could be nicer about the deleted file, and let it show
+ * switched - we have to switch the hash value properly even if it
- * up under the name it had before it was deleted rather than
+ * then no longer matches the actual (corrupted) string of the target.
- * under the original name of the file that was moved on top of it.
+ * The hash value has to match the hash queue that the dentry is on..
 */
- 
 /*
- * d_move_locked - move a dentry
+ * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way.
 */
-static void d_move_locked(struct dentry * dentry, struct dentry * target)
+void d_move(struct dentry * dentry, struct dentry * target)
 {
-        struct hlist_head *list;
        if (!dentry->d_inode)
                printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+        BUG_ON(d_ancestor(dentry, target));
+        BUG_ON(d_ancestor(target, dentry));
        write_seqlock(&rename_lock);
-        /*
-         * XXXX: do we really need to take target->d_lock?
-         */
-        if (target < dentry) {
-                spin_lock(&target->d_lock);
-                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-        } else {
-                spin_lock(&dentry->d_lock);
-                spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
-        }
-        /* Move the dentry to the target hash queue, if on different bucket */
+        dentry_lock_for_move(dentry, target);
-        if (d_unhashed(dentry))
-                goto already_unhashed;
-        hlist_del_rcu(&dentry->d_hash);
+        write_seqcount_begin(&dentry->d_seq);
+        write_seqcount_begin(&target->d_seq);
-already_unhashed:
+        /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
-        list = d_hash(target->d_parent, target->d_name.hash);
-        __d_rehash(dentry, list);
+        /*
+         * Move the dentry to the target hash queue. Don't bother checking
+         * for the same hash queue because of how unlikely it is.
+         */
+        __d_drop(dentry);
+        __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
        /* Unhash the target: dput() will then get rid of it */
        __d_drop(target);
@@ -1715,27 +2246,16 @@ already_unhashed:
        }
        list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+        write_seqcount_end(&target->d_seq);
+        write_seqcount_end(&dentry->d_seq);
+        dentry_unlock_parents_for_move(dentry, target);
        spin_unlock(&target->d_lock);
        fsnotify_d_move(dentry);
        spin_unlock(&dentry->d_lock);
        write_sequnlock(&rename_lock);
 }
-/**
- * d_move - move a dentry
- * @dentry: entry to move
- * @target: new dentry
- *
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way.
- */
-void d_move(struct dentry * dentry, struct dentry * target)
-{
-        spin_lock(&dcache_lock);
-        d_move_locked(dentry, target);
-        spin_unlock(&dcache_lock);
-}
 EXPORT_SYMBOL(d_move);
 /**
@@ -1761,13 +2281,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex and the dcache_lock
+ * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 */
-static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+static struct dentry *__d_unalias(struct inode *inode,
-        __releases(dcache_lock)
+                struct dentry *dentry, struct dentry *alias)
 {
        struct mutex *m1 = NULL, *m2 = NULL;
        struct dentry *ret;
@@ -1790,10 +2310,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
                goto out_err;
        m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
-        d_move_locked(alias, dentry);
+        d_move(alias, dentry);
        ret = alias;
 out_err:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        if (m2)
                mutex_unlock(m2);
        if (m1)
@@ -1804,17 +2324,23 @@ out_err:
 /*
 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
 * named dentry in place of the dentry to be replaced.
+ * returns with anon->d_lock held!
 */
 static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 {
        struct dentry *dparent, *aparent;
-        switch_names(dentry, anon);
+        dentry_lock_for_move(anon, dentry);
-        swap(dentry->d_name.hash, anon->d_name.hash);
+        write_seqcount_begin(&dentry->d_seq);
+        write_seqcount_begin(&anon->d_seq);
        dparent = dentry->d_parent;
        aparent = anon->d_parent;
+        switch_names(dentry, anon);
+        swap(dentry->d_name.hash, anon->d_name.hash);
        dentry->d_parent = (aparent == anon) ? dentry : aparent;
        list_del(&dentry->d_u.d_child);
        if (!IS_ROOT(dentry))
@@ -1829,6 +2355,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
        else
                INIT_LIST_HEAD(&anon->d_u.d_child);
+        write_seqcount_end(&dentry->d_seq);
+        write_seqcount_end(&anon->d_seq);
+        dentry_unlock_parents_for_move(anon, dentry);
+        spin_unlock(&dentry->d_lock);
+        /* anon->d_lock still locked, returns locked */
        anon->d_flags &= ~DCACHE_DISCONNECTED;
 }
@@ -1846,14 +2379,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
        BUG_ON(!d_unhashed(dentry));
-        spin_lock(&dcache_lock);
        if (!inode) {
                actual = dentry;
                __d_instantiate(dentry, NULL);
-                goto found_lock;
+                d_rehash(actual);
+                goto out_nolock;
        }
+        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *alias;
@@ -1864,13 +2398,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                        /* Is this an anonymous mountpoint that we could splice
                         * into our tree? */
                        if (IS_ROOT(alias)) {
-                                spin_lock(&alias->d_lock);
                                __d_materialise_dentry(dentry, alias);
                                __d_drop(alias);
                                goto found;
                        }
                        /* Nope, but we must(!) avoid directory aliasing */
-                        actual = __d_unalias(dentry, alias);
+                        actual = __d_unalias(inode, dentry, alias);
                        if (IS_ERR(actual))
                                dput(alias);
                        goto out_nolock;
@@ -1881,15 +2414,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
        actual = __d_instantiate_unique(dentry, inode);
        if (!actual)
                actual = dentry;
-        else if (unlikely(!d_unhashed(actual)))
+        else
-                goto shouldnt_be_hashed;
+                BUG_ON(!d_unhashed(actual));
-found_lock:
        spin_lock(&actual->d_lock);
 found:
        _d_rehash(actual);
        spin_unlock(&actual->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 out_nolock:
        if (actual == dentry) {
                security_d_instantiate(dentry, inode);
@@ -1898,10 +2430,6 @@ out_nolock:
        iput(inode);
        return actual;
-shouldnt_be_hashed:
-        spin_unlock(&dcache_lock);
-        BUG();
 }
 EXPORT_SYMBOL_GPL(d_materialise_unique);
@@ -1928,7 +2456,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
- * Caller holds the dcache_lock.
+ * Caller holds the rename_lock.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -1956,7 +2484,9 @@ static int prepend_path(const struct path *path, struct path *root,
                }
                parent = dentry->d_parent;
                prefetch(parent);
+                spin_lock(&dentry->d_lock);
                error = prepend_name(buffer, buflen, &dentry->d_name);
+                spin_unlock(&dentry->d_lock);
                if (!error)
                        error = prepend(buffer, buflen, "/", 1);
                if (error)
@@ -2012,9 +2542,9 @@ char *__d_path(const struct path *path, struct path *root,
        int error;
        prepend(&res, &buflen, "\0", 1);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        error = prepend_path(path, root, &res, &buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        if (error)
                return ERR_PTR(error);
@@ -2076,12 +2606,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
        get_fs_root(current->fs, &root);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (error)
                res = ERR_PTR(error);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        path_put(&root);
        return res;
 }
@@ -2107,12 +2637,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
        get_fs_root(current->fs, &root);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (!error && !path_equal(&tmp, &root))
                error = prepend_unreachable(&res, &buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        path_put(&root);
        if (error)
                res =  ERR_PTR(error);
@@ -2144,7 +2674,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 /*
 * Write full pathname from the root of the filesystem into the buffer.
 */
-char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
+static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
        char *end = buf + buflen;
        char *retval;
@@ -2158,10 +2688,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
        while (!IS_ROOT(dentry)) {
                struct dentry *parent = dentry->d_parent;
+                int error;
                prefetch(parent);
-                if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+                spin_lock(&dentry->d_lock);
-                    (prepend(&end, &buflen, "/", 1) != 0))
+                error = prepend_name(&end, &buflen, &dentry->d_name);
+                spin_unlock(&dentry->d_lock);
+                if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
                        goto Elong;
                retval = end;
@@ -2171,14 +2704,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
-EXPORT_SYMBOL(__dentry_path);
+/*
+ * dentry_path_raw - run __dentry_path() with rename_lock write-held
+ *
+ * Returns whatever __dentry_path() produced for @dentry in @buf/@buflen,
+ * including its ERR_PTR() value on failure.
+ */
+char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+{
+        char *path;
+        write_seqlock(&rename_lock);
+        path = __dentry_path(dentry, buf, buflen);
+        write_sequnlock(&rename_lock);
+        return path;
+}
+EXPORT_SYMBOL(dentry_path_raw);
 char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
        char *p = NULL;
        char *retval;
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        if (d_unlinked(dentry)) {
                p = buf + buflen;
                if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2186,12 +2730,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
                buflen++;
        }
        retval = __dentry_path(dentry, buf, buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        if (!IS_ERR(retval) && p)
                *p = '/';       /* restore '/' overriden with '\0' */
        return retval;
 Elong:
-        spin_unlock(&dcache_lock);
        return ERR_PTR(-ENAMETOOLONG);
 }
@@ -2225,7 +2768,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
        get_fs_root_and_pwd(current->fs, &root, &pwd);
        error = -ENOENT;
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                struct path tmp = root;
@@ -2234,7 +2777,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                prepend(&cwd, &buflen, "\0", 1);
                error = prepend_path(&pwd, &tmp, &cwd, &buflen);
-                spin_unlock(&dcache_lock);
+                write_sequnlock(&rename_lock);
                if (error)
                        goto out;
@@ -2253,8 +2796,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                        if (copy_to_user(buf, cwd, len))
                                error = -EFAULT;
                }
-        } else
+        } else {
-                spin_unlock(&dcache_lock);
+                write_sequnlock(&rename_lock);
+        }
 out:
        path_put(&pwd);
@@ -2282,25 +2826,25 @@ out:
 int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
        int result;
-        unsigned long seq;
+        unsigned seq;        /* rename_lock sequence sampled by read_seqbegin() */
        if (new_dentry == old_dentry)
                return 1;
-        /*
-         * Need rcu_readlock to protect against the d_parent trashing
-         * due to d_move
-         */
-        rcu_read_lock();
        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
+                /*
+                 * Need rcu_readlock to protect against the d_parent trashing
+                 * due to d_move. It is taken and dropped inside the loop, so
+                 * each retry of the ancestry check runs in its own RCU
+                 * read-side section.
+                 */
+                rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = 1;
                else
                        result = 0;
+                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));
-        rcu_read_unlock();
        return result;
 }
@@ -2332,10 +2876,15 @@ EXPORT_SYMBOL(path_is_under);
 void d_genocide(struct dentry *root)
 {
-        struct dentry *this_parent = root;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
+        int locked = 0;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = root;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -2343,21 +2892,62 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-                if (d_unhashed(dentry)||!dentry->d_inode)
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+                if (d_unhashed(dentry) || !dentry->d_inode) {
+                        spin_unlock(&dentry->d_lock);
                        continue;
+                }
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
-                atomic_dec(&dentry->d_count);
+                if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+                        dentry->d_flags |= DCACHE_GENOCIDE;
+                        dentry->d_count--;
+                }
+                spin_unlock(&dentry->d_lock);
        }
        if (this_parent != root) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                atomic_dec(&this_parent->d_count);
+                struct dentry *child;
-                this_parent = this_parent->d_parent;
+                tmp = this_parent->d_parent;
+                if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
+                        this_parent->d_flags |= DCACHE_GENOCIDE;
+                        this_parent->d_count--;
+                }
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                         (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
+        return;
+rename_retry:
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 /**
@@ -2411,7 +3001,7 @@ static void __init dcache_init_early(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct hlist_head),
+                                        sizeof(struct dcache_hash_bucket),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY,
@@ -2420,16 +3010,13 @@ static void __init dcache_init_early(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 static void __init dcache_init(void)
 {
        int loop;
-        percpu_counter_init(&nr_dentry, 0);
-        percpu_counter_init(&nr_dentry_unused, 0);
        /* 
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
@@ -2446,7 +3033,7 @@ static void __init dcache_init(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct hlist_head),
+                                        sizeof(struct dcache_hash_bucket),
                                        dhash_entries,
                                        13,
                                        0,
@@ -2455,7 +3042,7 @@ static void __init dcache_init(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 /* SLAB cache for __getname() consumers */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622..9c64ae9e4c1 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
 #define NEEDED_RMEM (4*1024*1024)
 #define CONN_HASH_SIZE 32
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
 struct cbuf {
        unsigned int base;
        unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
        struct list_head writequeue;  /* List of outgoing writequeue_entries */
        spinlock_t writequeue_lock;
        int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
        struct connection *con = sock2con(sk);
-        if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+        if (!con)
+                return;
+        clear_bit(SOCK_NOSPACE, &con->sock->flags);
+        if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+                con->sock->sk->sk_write_pending--;
+                clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+        }
+        if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
                queue_work(send_workqueue, &con->swork);
 }
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
        struct sockaddr_storage saddr, src_addr;
        int addr_len;
        struct socket *sock = NULL;
+        int one = 1;
        if (con->nodeid == 0) {
                log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
        make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
        log_print("connecting to %d", con->nodeid);
+        /* Turn off Nagle's algorithm */
+        kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+                          sizeof(one));
        result =
                sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
                                   O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
                goto create_out;
        }
+        /* Turn off Nagle's algorithm */
+        kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+                          sizeof(one));
        result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                                   (char *)&one, sizeof(one));
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
        const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
        struct writequeue_entry *e;
        int len, offset;
+        int count = 0;
        mutex_lock(&con->sock_mutex);
        if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
                        ret = kernel_sendpage(con->sock, e->page, offset, len,
                                              msg_flags);
                        if (ret == -EAGAIN || ret == 0) {
+                                if (ret == -EAGAIN &&
+                                    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+                                    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+                                        /* Notify TCP that we're limited by the
+                                         * application window size.
+                                         */
+                                        set_bit(SOCK_NOSPACE, &con->sock->flags);
+                                        con->sock->sk->sk_write_pending++;
+                                }
                                cond_resched();
                                goto out;
                        }
                        if (ret <= 0)
                                goto send_error;
                }
-                        /* Don't starve people filling buffers */
+                /* Don't starve people filling buffers */
+                if (++count >= MAX_SEND_MSG_COUNT) {
                        cond_resched();
+                        count = 0;
+                }
                spin_lock(&con->writequeue_lock);
                e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
 static int work_start(void)
 {
-        int error;
+        recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
-        recv_workqueue = create_workqueue("dlm_recv");
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-        error = IS_ERR(recv_workqueue);
+        if (!recv_workqueue) {
-        if (error) {
+                log_print("can't start dlm_recv");
-                log_print("can't start dlm_recv %d", error);
+                return -ENOMEM;
-                return error;
        }
-        send_workqueue = create_singlethread_workqueue("dlm_send");
+        send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
-        error = IS_ERR(send_workqueue);
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-        if (error) {
+        if (!send_workqueue) {
-                log_print("can't start dlm_send %d", error);
+                log_print("can't start dlm_send");
                destroy_workqueue(recv_workqueue);
-                return error;
+                return -ENOMEM;
        }
        return 0;
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f7..6fc4f319b55 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
 */
 static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+        struct dentry *lower_dentry;
-        struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+        struct vfsmount *lower_mnt;
        struct dentry *dentry_save;
        struct vfsmount *vfsmount_save;
        int rc = 1;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        lower_dentry = ecryptfs_dentry_to_lower(dentry);
+        lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
        if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
                goto out;
        dentry_save = nd->path.dentry;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d1a22d6276..337352a9475 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -260,7 +260,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                                   ecryptfs_dentry->d_parent));
        lower_inode = lower_dentry->d_inode;
        fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
-        BUG_ON(!atomic_read(&lower_dentry->d_count));
+        BUG_ON(!lower_dentry->d_count);
        ecryptfs_set_dentry_private(ecryptfs_dentry,
                                    kmem_cache_alloc(ecryptfs_dentry_info_cache,
                                                     GFP_KERNEL));
@@ -441,7 +441,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        struct qstr lower_name;
        int rc = 0;
-        ecryptfs_dentry->d_op = &ecryptfs_dops;
+        d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
        if ((ecryptfs_dentry->d_name.len == 1
             && !strcmp(ecryptfs_dentry->d_name.name, "."))
            || (ecryptfs_dentry->d_name.len == 2
@@ -454,7 +454,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        lower_name.hash = ecryptfs_dentry->d_name.hash;
        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-                                                    &lower_name);
+                                lower_dir_dentry->d_inode, &lower_name);
                if (rc < 0)
                        goto out_d_drop;
        }
@@ -489,7 +489,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-                                                    &lower_name);
+                                lower_dir_dentry->d_inode, &lower_name);
                if (rc < 0)
                        goto out_d_drop;
        }
@@ -980,8 +980,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 static int
-ecryptfs_permission(struct inode *inode, int mask)
+ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        return inode_permission(ecryptfs_inode_to_lower(inode), mask);
 }
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a9dbd62518e..35103867537 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -189,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
        if (special_file(lower_inode->i_mode))
                init_special_inode(inode, lower_inode->i_mode,
                                   lower_inode->i_rdev);
-        dentry->d_op = &ecryptfs_dops;
+        d_set_d_op(dentry, &ecryptfs_dops);
        fsstack_copy_attr_all(inode, lower_inode);
        /* This size will be overwritten for real files w/ headers and
         * other metadata */
@@ -594,7 +594,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
                deactivate_locked_super(s);
                goto out;
        }
-        s->s_root->d_op = &ecryptfs_dops;
+        d_set_d_op(s->s_root, &ecryptfs_dops);
        s->s_root->d_sb = s;
        s->s_root->d_parent = s->s_root;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 2720178b771..3042fe123a3 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -62,6 +62,16 @@ out:
        return inode;
 }
+static void ecryptfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ecryptfs_inode_info *inode_info;
+        inode_info = ecryptfs_inode_to_private(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
 /**
 * ecryptfs_destroy_inode
 * @inode: The ecryptfs inode
@@ -88,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
                }
        }
        ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
-        kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+        call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
 /**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5073a07652c..0f31acb0131 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void efs_destroy_inode(struct inode *inode)
+static void efs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
+static void efs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, efs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067..cc8a9b7d606 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
 * Configuration options available inside /proc/sys/fs/epoll/
 */
 /* Maximum number of epoll watched descriptors, per user */
-static int max_user_watches __read_mostly;
+static long max_user_watches __read_mostly;
 /*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
 #include <linux/sysctl.h>
-static int zero;
+static long zero;
+static long long_max = LONG_MAX;
 ctl_table epoll_table[] = {
        {
                .procname       = "max_user_watches",
                .data           = &max_user_watches,
-                .maxlen         = sizeof(int),
+                .maxlen         = sizeof(max_user_watches),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = &zero,
+                .extra2         = &long_max,
        },
        { }
 };
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        /* At this point it is safe to free the eventpoll item */
        kmem_cache_free(epi_cache, epi);
-        atomic_dec(&ep->user->epoll_watches);
+        atomic_long_dec(&ep->user->epoll_watches);
        return 0;
 }
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 {
        int error, revents, pwake = 0;
        unsigned long flags;
+        long user_watches;
        struct epitem *epi;
        struct ep_pqueue epq;
-        if (unlikely(atomic_read(&ep->user->epoll_watches) >=
+        user_watches = atomic_long_read(&ep->user->epoll_watches);
-                     max_user_watches))
+        if (unlikely(user_watches >= max_user_watches))
                return -ENOSPC;
        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
                return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
        spin_unlock_irqrestore(&ep->lock, flags);
-        atomic_inc(&ep->user->epoll_watches);
+        atomic_long_inc(&ep->user->epoll_watches);
        /* We have to call this outside the lock */
        if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
         */
        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
+        BUG_ON(max_user_watches < 0);
        /* Initialize the structure used to perform safe poll wait head wake ups */
        ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 79c3ae6e045..8c6c4669b38 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
+static void exofs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
 /*
 * Remove an inode from the cache
 */
 static void exofs_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+        call_rcu(&inode->i_rcu, exofs_i_callback);
 }
 /*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51b304056f1..4b6825740dd 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
                void *context)
 {
        struct dentry *dentry, *toput = NULL;
+        struct inode *inode;
        if (acceptable(context, result))
                return result;
-        spin_lock(&dcache_lock);
+        inode = result->d_inode;
-        list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
+        spin_lock(&inode->i_lock);
-                dget_locked(dentry);
+        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
-                spin_unlock(&dcache_lock);
+                dget(dentry);
+                spin_unlock(&inode->i_lock);
                if (toput)
                        dput(toput);
                if (dentry != result && acceptable(context, dentry)) {
                        dput(result);
                        return dentry;
                }
-                spin_lock(&dcache_lock);
+                spin_lock(&inode->i_lock);
                toput = dentry;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        if (toput)
                dput(toput);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bad..7b4180554a6 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 int
-ext2_check_acl(struct inode *inode, int mask)
+ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac4..c939b7b1209 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 /* acl.c */
-extern int ext2_check_acl (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int, unsigned int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206a..47cda410b54 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
 typedef struct ext2_dir_entry_2 ext2_dirent;
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
 static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
 {
        unsigned len = le16_to_cpu(dlen);
+#if (PAGE_CACHE_SIZE >= 65536)
        if (len == EXT2_MAX_REC_LEN)
                return 1 << 16;
+#endif
        return len;
 }
 static inline __le16 ext2_rec_len_to_disk(unsigned len)
 {
+#if (PAGE_CACHE_SIZE >= 65536)
        if (len == (1 << 16))
                return cpu_to_le16(EXT2_MAX_REC_LEN);
        else
                BUG_ON(len > (1 << 16));
+#endif
        return cpu_to_le16(len);
 }
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
                p = (ext2_dirent *)(kaddr + offs);
                rec_len = ext2_rec_len_from_disk(p->rec_len);
-                if (rec_len < EXT2_DIR_REC_LEN(1))
+                if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
                        goto Eshort;
-                if (rec_len & 3)
+                if (unlikely(rec_len & 3))
                        goto Ealign;
-                if (rec_len < EXT2_DIR_REC_LEN(p->name_len))
+                if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
                        goto Enamelen;
-                if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+                if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
                        goto Espan;
-                if (le32_to_cpu(p->inode) > max_inumber)
+                if (unlikely(le32_to_cpu(p->inode) > max_inumber))
                        goto Einumber;
        }
        if (offs != limit)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e329..2e1d8341d82 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
        inode = NULL;
        if (ino) {
                inode = ext2_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                ext2_error(dir->i_sb, __func__,
                                                "deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d89e0b6a2d7..7731695e65d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
-void ext2_error (struct super_block * sb, const char * function,
+void ext2_error(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
        struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
        }
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
 void ext2_msg(struct super_block *sb, const char *prefix,
                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
 }
@@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ext2_destroy_inode(struct inode *inode)
+static void ext2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
+static void ext2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ext2_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be327..c2e4dce984d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
                        goto found;
                entry = next;
        }
-        /* Check the remaining name entries */
-        while (!IS_LAST_ENTRY(entry)) {
-                struct ext2_xattr_entry *next =
-                        EXT2_XATTR_NEXT(entry);
-                if ((char *)next >= end)
-                        goto bad_block;
-                entry = next;
-        }
        if (ext2_xattr_cache_insert(bh))
                ea_idebug(inode, "cache insert failed");
        error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 /*
 * ext2_xattr_set()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe21218..e4fa49e6c53 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 int
-ext3_check_acl(struct inode *inode, int mask)
+ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de..5faf8048e90 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 /* acl.c */
-extern int ext3_check_acl (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int, unsigned int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db2264942..045995c8ce5 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 /*
 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+        ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+        struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+        blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+        if (offsetp)
+                *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+        if (blockgrpp)
+                *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
 /**
 * ext3_get_group_desc() -- load group descriptor from disk
 * @sb:                 super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
        return ext3_bg_num_gdb_meta(sb,group);
 }
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:                 super block for file system
+ * @group:              allocation group to trim
+ * @start:              first group block to examine
+ * @max:                end of the group block range to examine (exclusive)
+ * @minblocks:          minimum extent block count
+ *
+ * ext3_trim_all_free walks through the group's block bitmap searching for
+ * free blocks. When a free block is found, it tries to allocate this block
+ * and the subsequent free blocks to get the biggest free extent possible,
+ * until it reaches a used block. It then issues a TRIM command on this extent
+ * and frees the extent in the block bitmap, until the whole group is scanned.
+ * Returns the number of blocks trimmed, or a negative error code on failure.
+ */
+ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
+                                ext3_grpblk_t start, ext3_grpblk_t max,
+                                ext3_grpblk_t minblocks)
+{
+        handle_t *handle;
+        ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+        ext3_fsblk_t discard_block;
+        struct ext3_sb_info *sbi;
+        struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+        struct ext3_group_desc *gdp;
+        int err = 0, ret = 0;
+        /*
+         * We will update one block bitmap, and one group descriptor
+         */
+        handle = ext3_journal_start_sb(sb, 2);
+        if (IS_ERR(handle))
+                return PTR_ERR(handle);
+        bitmap_bh = read_block_bitmap(sb, group);
+        if (!bitmap_bh) {
+                err = -EIO;
+                goto err_out;
+        }
+        BUFFER_TRACE(bitmap_bh, "getting undo access");
+        err = ext3_journal_get_undo_access(handle, bitmap_bh);
+        if (err)
+                goto err_out;
+        gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+        if (!gdp) {
+                err = -EIO;
+                goto err_out;
+        }
+        BUFFER_TRACE(gdp_bh, "get_write_access");
+        err = ext3_journal_get_write_access(handle, gdp_bh);
+        if (err)
+                goto err_out;
+        free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+        sbi = EXT3_SB(sb);
+         /* Walk through the whole group */
+        while (start < max) {
+                start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+                if (start < 0)
+                        break;
+                next = start;
+                /*
+                 * Allocate contiguous free extents by setting bits in the
+                 * block bitmap
+                 */
+                while (next < max
+                        && claim_block(sb_bgl_lock(sbi, group),
+                                        next, bitmap_bh)) {
+                        next++;
+                }
+                 /* We did not claim any blocks */
+                if (next == start)
+                        continue;
+                discard_block = (ext3_fsblk_t)start +
+                                ext3_group_first_block_no(sb, group);
+                /* Update counters */
+                spin_lock(sb_bgl_lock(sbi, group));
+                le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+                spin_unlock(sb_bgl_lock(sbi, group));
+                percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+                /* Do not issue a TRIM on extents smaller than minblocks */
+                if ((next - start) < minblocks)
+                        goto free_extent;
+                 /* Send the TRIM command down to the device */
+                err = sb_issue_discard(sb, discard_block, next - start,
+                                       GFP_NOFS, 0);
+                count += (next - start);
+free_extent:
+                freed = 0;
+                /*
+                 * Clear bits in the bitmap
+                 */
+                for (bit = start; bit < next; bit++) {
+                        BUFFER_TRACE(bitmap_bh, "clear bit");
+                        if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+                                                bit, bitmap_bh->b_data)) {
+                                ext3_error(sb, __func__,
+                                        "bit already cleared for block "E3FSBLK,
+                                         (unsigned long)bit);
+                                BUFFER_TRACE(bitmap_bh, "bit already cleared");
+                        } else {
+                                freed++;
+                        }
+                }
+                /* Update counters */
+                spin_lock(sb_bgl_lock(sbi, group));
+                le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+                spin_unlock(sb_bgl_lock(sbi, group));
+                percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+                start = next;
+                if (err < 0) {
+                        if (err != -EOPNOTSUPP)
+                                ext3_warning(sb, __func__, "Discard command "
+                                             "returned error %d\n", err);
+                        break;
+                }
+                if (fatal_signal_pending(current)) {
+                        err = -ERESTARTSYS;
+                        break;
+                }
+                cond_resched();
+                /* No more suitable extents */
+                if ((free_blocks - count) < minblocks)
+                        break;
+        }
+        /* We dirtied the bitmap block */
+        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+        ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+        if (!err)
+                err = ret;
+        /* And the group descriptor block */
+        BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+        ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+        if (!err)
+                err = ret;
+        ext3_debug("trimmed %d blocks in the group %d\n",
+                count, group);
+err_out:
+        if (err)
+                count = err;
+        ext3_journal_stop(handle);
+        brelse(bitmap_bh);
+        return count;
+}
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @sb:                 superblock for filesystem
+ * @range:              fstrim_range structure; start, len and minlen are
+ *                      given in bytes, and len is overwritten with the
+ *                      number of bytes trimmed
+ *
+ * ext3_trim_fs goes through all allocation groups containing bytes from
+ * range->start to range->start + range->len. For each such group the
+ * ext3_trim_all_free function is invoked to trim all free space.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+        ext3_grpblk_t last_block, first_block, free_blocks;
+        unsigned long first_group, last_group;
+        unsigned long group, ngroups;
+        struct ext3_group_desc *gdp;
+        struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+        uint64_t start, len, minlen, trimmed;
+        ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+        int ret = 0;
+        start = range->start >> sb->s_blocksize_bits;
+        len = range->len >> sb->s_blocksize_bits;
+        minlen = range->minlen >> sb->s_blocksize_bits;
+        trimmed = 0;
+        if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+                return -EINVAL;
+        if (start >= max_blks)
+                goto out;
+        if (start < le32_to_cpu(es->s_first_data_block)) {
+                len -= le32_to_cpu(es->s_first_data_block) - start;
+                start = le32_to_cpu(es->s_first_data_block);
+        }
+        if (start + len > max_blks)
+                len = max_blks - start;
+        ngroups = EXT3_SB(sb)->s_groups_count;
+        smp_rmb();
+        /* Determine first and last group to examine based on start and len */
+        ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+                                     &first_group, &first_block);
+        ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+                                     &last_group, &last_block);
+        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+        last_block = EXT3_BLOCKS_PER_GROUP(sb);
+        if (first_group > last_group)
+                return -EINVAL;
+        for (group = first_group; group <= last_group; group++) {
+                gdp = ext3_get_group_desc(sb, group, NULL);
+                if (!gdp)
+                        break;
+                free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+                if (free_blocks < minlen)
+                        continue;
+                if (len >= EXT3_BLOCKS_PER_GROUP(sb))
+                        len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
+                else
+                        last_block = first_block + len;
+                ret = ext3_trim_all_free(sb, group, first_block,
+                                        last_block, minlen);
+                if (ret < 0)
+                        break;
+                trimmed += ret;
+                first_block = 0;
+        }
+        if (ret >= 0)
+                ret = 0;
+out:
+        range->len = trimmed * sb->s_blocksize;
+        return ret;
+}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf..34f0a072b93 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
        const char * error_msg = NULL;
        const int rlen = ext3_rec_len_from_disk(de->rec_len);
-        if (rlen < EXT3_DIR_REC_LEN(1))
+        if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
                error_msg = "rec_len is smaller than minimal";
-        else if (rlen % 4 != 0)
+        else if (unlikely(rlen % 4 != 0))
                error_msg = "rec_len % 4 != 0";
-        else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+        else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
-        else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+        else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
                error_msg = "directory entry across blocks";
-        else if (le32_to_cpu(de->inode) >
+        else if (unlikely(le32_to_cpu(de->inode) >
-                        le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+                        le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
-        if (error_msg != NULL)
+        if (unlikely(error_msg != NULL))
                ext3_error (dir->i_sb, function,
                        "bad entry in directory #%lu: %s - "
                        "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
                        dir->i_ino, error_msg, offset,
                        (unsigned long) le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd..ae94f6d949f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-                        ext3_journal_dirty_metadata(handle, bh);
+                        if (ext3_journal_dirty_metadata(handle, bh))
+                                return;
                }
                ext3_mark_inode_dirty(handle, inode);
                truncate_restart_transaction(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
-                        ext3_journal_get_write_access(handle, bh);
+                        if (ext3_journal_get_write_access(handle, bh))
+                                return;
                }
        }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783..fc080dd561f 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
+        case FITRIM: {
+                struct super_block *sb = inode->i_sb;
+                struct fstrim_range range;
+                int ret = 0;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&range, (struct fstrim_range *)arg,
+                                   sizeof(range)))
+                        return -EFAULT;
+                ret = ext3_trim_fs(sb, &range);
+                if (ret < 0)
+                        return ret;
+                if (copy_to_user((struct fstrim_range *)arg, &range,
+                                 sizeof(range)))
+                        return -EFAULT;
+                return 0;
+        }
        default:
                return -ENOTTY;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b..b27ba71810e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
        struct buffer_head * bh_use[NAMEI_RA_SIZE];
        struct buffer_head * bh, *ret = NULL;
        unsigned long start, block, b;
+        const u8 *name = entry->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
        namelen = entry->len;
        if (namelen > EXT3_NAME_LEN)
                return NULL;
+        if ((namelen <= 2) && (name[0] == '.') &&
+            (name[1] == '.' || name[1] == 0)) {
+                /*
+                 * "." or ".." will only be in the first block
+                 * NFS may look up ".."; "." should be handled by the VFS
+                 */
+                block = start = 0;
+                nblocks = 1;
+                goto restart;
+        }
        if (is_dx(dir)) {
                bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
                /*
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
                        struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
                        int *err)
 {
-        struct super_block * sb;
+        struct super_block *sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-        u32 hash;
        struct dx_frame frames[2], *frame;
-        struct ext3_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        unsigned long block;
        int retval;
-        int namelen = entry->len;
-        const u8 *name = entry->name;
-        sb = dir->i_sb;
+        if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
-        /* NFS may look up ".." - look at dx_root directory block */
+                return NULL;
-        if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
-                if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
-                        return NULL;
-        } else {
-                frame = frames;
-                frame->bh = NULL;                       /* for dx_release() */
-                frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-                dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-        }
-        hash = hinfo.hash;
        do {
                block = dx_get_block(frame->at);
                if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
                        goto errout;
-                de = (struct ext3_dir_entry_2 *) bh->b_data;
-                top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                       EXT3_DIR_REC_LEN(0));
-                for (; de < top; de = ext3_next_entry(de)) {
-                        int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
-                                  + ((char *) de - bh->b_data);
-                        if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
-                                brelse(bh);
-                                *err = ERR_BAD_DX_DIR;
-                                goto errout;
-                        }
-                        if (ext3_match(namelen, name, de)) {
+                retval = search_dirblock(bh, dir, entry,
-                                *res_dir = de;
+                                         block << EXT3_BLOCK_SIZE_BITS(sb),
-                                dx_release(frames);
+                                         res_dir);
-                                return bh;
+                if (retval == 1) {
-                        }
+                        dx_release(frames);
+                        return bh;
                }
-                brelse (bh);
+                brelse(bh);
+                if (retval == -1) {
+                        *err = ERR_BAD_DX_DIR;
+                        goto errout;
+                }
                /* Check to see if we should continue to search */
-                retval = ext3_htree_next_block(dir, hash, frame,
+                retval = ext3_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
                        return ERR_PTR(-EIO);
                }
                inode = ext3_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                ext3_error(dir->i_sb, __func__,
                                                "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-                ext3_journal_dirty_metadata(handle, frames[0].bh);
+                err = ext3_journal_dirty_metadata(handle, frames[0].bh);
+                if (err)
+                        goto journal_error;
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
                if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
+                        int err;
                        BUFFER_TRACE(bh, "get_write_access");
-                        ext3_journal_get_write_access(handle, bh);
+                        err = ext3_journal_get_write_access(handle, bh);
+                        if (err)
+                                goto journal_error;
                        if (pde)
                                pde->rec_len = ext3_rec_len_to_disk(
                                        ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
                                de->inode = 0;
                        dir->i_version++;
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-                        ext3_journal_dirty_metadata(handle, bh);
+                        err = ext3_journal_dirty_metadata(handle, bh);
+                        if (err) {
+journal_error:
+                                ext3_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        return 0;
                }
                i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 {
        handle_t *handle;
        struct inode * inode;
-        struct buffer_head * dir_block;
+        struct buffer_head * dir_block = NULL;
        struct ext3_dir_entry_2 * de;
        int err, retries = 0;
@@ -1790,15 +1793,14 @@ retry:
        inode->i_fop = &ext3_dir_operations;
        inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
-        if (!dir_block) {
+        if (!dir_block)
-                drop_nlink(inode); /* is this nlink == 0? */
+                goto out_clear_inode;
-                unlock_new_inode(inode);
-                ext3_mark_inode_dirty(handle, inode);
-                iput (inode);
-                goto out_stop;
-        }
        BUFFER_TRACE(dir_block, "get_write_access");
-        ext3_journal_get_write_access(handle, dir_block);
+        err = ext3_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
        de = (struct ext3_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
        ext3_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
        BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
-        ext3_journal_dirty_metadata(handle, dir_block);
+        err = ext3_journal_dirty_metadata(handle, dir_block);
-        brelse (dir_block);
+        if (err)
-        ext3_mark_inode_dirty(handle, inode);
+                goto out_clear_inode;
-        err = ext3_add_entry (handle, dentry, inode);
+        err = ext3_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext3_add_entry (handle, dentry, inode);
        if (err) {
+out_clear_inode:
                inode->i_nlink = 0;
                unlock_new_inode(inode);
                ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
        }
        inc_nlink(dir);
        ext3_update_dx_flag(dir);
-        ext3_mark_inode_dirty(handle, dir);
+        err = ext3_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
        ext3_journal_stop(handle);
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
                        goto end_rename;
        } else {
                BUFFER_TRACE(new_bh, "get write access");
-                ext3_journal_get_write_access(handle, new_bh);
+                retval = ext3_journal_get_write_access(handle, new_bh);
+                if (retval)
+                        goto journal_error;
                new_de->inode = cpu_to_le32(old_inode->i_ino);
                if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
                                              EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
                new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
                ext3_mark_inode_dirty(handle, new_dir);
                BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
-                ext3_journal_dirty_metadata(handle, new_bh);
+                retval = ext3_journal_dirty_metadata(handle, new_bh);
+                if (retval)
+                        goto journal_error;
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
        ext3_update_dx_flag(old_dir);
        if (dir_bh) {
                BUFFER_TRACE(dir_bh, "get_write_access");
-                ext3_journal_get_write_access(handle, dir_bh);
+                retval = ext3_journal_get_write_access(handle, dir_bh);
+                if (retval)
+                        goto journal_error;
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
-                ext3_journal_dirty_metadata(handle, dir_bh);
+                retval = ext3_journal_dirty_metadata(handle, dir_bh);
+                if (retval) {
+journal_error:
+                        ext3_std_error(new_dir->i_sb, retval);
+                        goto end_rename;
+                }
                drop_nlink(old_dir);
                if (new_inode) {
                        drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b123..108b142e11e 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-                ext3_journal_dirty_metadata(handle, gdb);
+                err = ext3_journal_dirty_metadata(handle, gdb);
+                if (err) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext3_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(gdb);
                        goto exit_bh;
                }
-                ext3_journal_dirty_metadata(handle, gdb);
+                err = ext3_journal_dirty_metadata(handle, gdb);
+                if (err) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext3_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(it);
                        goto exit_bh;
                }
-                ext3_journal_dirty_metadata(handle, it);
+                err = ext3_journal_dirty_metadata(handle, it);
+                if (err) {
+                        brelse(it);
+                        goto exit_bh;
+                }
                brelse(it);
                ext3_set_bit(bit, bh->b_data);
        }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
                        bh->b_data);
-        ext3_journal_dirty_metadata(handle, bh);
+        err = ext3_journal_dirty_metadata(handle, bh);
+        if (err)
+                goto exit_bh;
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
                        bh->b_data);
-        ext3_journal_dirty_metadata(handle, bh);
+        err = ext3_journal_dirty_metadata(handle, bh);
 exit_bh:
        brelse(bh);
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
-        ext3_journal_dirty_metadata(handle, dind);
+        err = ext3_journal_dirty_metadata(handle, dind);
+        if (err)
+                goto exit_group_desc;
        brelse(dind);
+        dind = NULL;
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
-        ext3_mark_iloc_dirty(handle, inode, &iloc);
+        err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+        if (err)
+                goto exit_group_desc;
        memset((*primary)->b_data, 0, sb->s_blocksize);
-        ext3_journal_dirty_metadata(handle, *primary);
+        err = ext3_journal_dirty_metadata(handle, *primary);
+        if (err)
+                goto exit_group_desc;
        o_group_desc = EXT3_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        if (err)
+                goto exit_inode;
        return 0;
+exit_group_desc:
+        kfree(n_group_desc);
 exit_inode:
        //ext3_journal_release_buffer(handle, iloc.bh);
        brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
                }
                ext3_debug("update metadata backup %#04lx\n",
                          (unsigned long)bh->b_blocknr);
-                if ((err = ext3_journal_get_write_access(handle, bh)))
+                if ((err = ext3_journal_get_write_access(handle, bh))) {
+                        brelse(bh);
                        break;
+                }
                lock_buffer(bh);
                memcpy(bh->b_data, data, size);
                if (rest)
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                ext3_journal_dirty_metadata(handle, bh);
+                err = ext3_journal_dirty_metadata(handle, bh);
                brelse(bh);
+                if (err)
+                        break;
        }
        if ((err2 = ext3_journal_stop(handle)) && !err)
                err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        /* Update the global fs size fields */
        sbi->s_groups_count++;
-        ext3_journal_dirty_metadata(handle, primary);
+        err = ext3_journal_dirty_metadata(handle, primary);
+        if (err)
+                goto exit_journal;
        /* Update the reserved block counts only once the new group is
         * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        percpu_counter_add(&sbi->s_freeinodes_counter,
                           EXT3_INODES_PER_GROUP(sb));
-        ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
 exit_journal:
        mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                goto exit_put;
        }
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
-        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+        if (err) {
+                ext3_warning(sb, __func__,
+                             "error %d on journal dirty metadata", err);
+                ext3_journal_stop(handle);
+                goto exit_put;
+        }
        ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
                   o_blocks_count, o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index acf8695fa8f..b7d0554631e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
 void ext3_msg(struct super_block *sb, const char *prefix,
                const char *fmt, ...)
 {
+        struct va_format vaf;   /* bundles fmt and the va_list for printk's %pV */
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);       /* prefix, device id, message and newline in a single printk() call */
        va_end(args);
 }
@@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
                        sb->s_id);
 }
-void ext3_error (struct super_block * sb, const char * function,
+void ext3_error(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        ext3_handle_error(sb);
@@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
 * case we take the easy way out and panic immediately.
 */
-void ext3_abort (struct super_block * sb, const char * function,
+void ext3_abort(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                 const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (test_opt(sb, ERRORS_PANIC))
@@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
                journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
-void ext3_warning (struct super_block * sb, const char * function,
+void ext3_warning(struct super_block *sb, const char *function,
-                   const char * fmt, ...)
+                  const char *fmt, ...)
 {
+        struct va_format vaf;   /* bundles fmt and the va_list for printk's %pV */
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
-               sb->s_id, function);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+               sb->s_id, function, &vaf);       /* header, caller's message and newline in a single printk() call */
        va_end(args);
 }
@@ -479,6 +497,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
+static void ext3_i_callback(struct rcu_head *head)
+{       /* RCU callback: ext3_destroy_inode() queues it with call_rcu() */
+        struct inode *inode = container_of(head, struct inode, i_rcu);  /* head is the inode's embedded i_rcu member */
+        INIT_LIST_HEAD(&inode->i_dentry);       /* reset the i_dentry list head before the object is freed */
+        kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));     /* give the enclosing ext3 inode back to ext3_inode_cachep */
+}
 static void ext3_destroy_inode(struct inode *inode)
 {
        if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -489,7 +514,7 @@ static void ext3_destroy_inode(struct inode *inode)
                                false);
                dump_stack();
        }
-        kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+        call_rcu(&inode->i_rcu, ext3_i_callback);
 }
 static void init_once(void *foo)
@@ -1841,13 +1866,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        if (generic_check_addressable(sb->s_blocksize_bits,
+        err = generic_check_addressable(sb->s_blocksize_bits,
-                                      le32_to_cpu(es->s_blocks_count))) {
+                                        le32_to_cpu(es->s_blocks_count));
+        if (err) {
                ext3_msg(sb, KERN_ERR,
                        "error: filesystem is too large to mount safely");
                if (sizeof(sector_t) < 8)
                        ext3_msg(sb, KERN_ERR,
                                "error: CONFIG_LBDAF not enabled");
+                ret = err;
                goto failed_mount;
        }
@@ -2290,7 +2317,7 @@ static int ext3_load_journal(struct super_block *sb,
        EXT3_SB(sb)->s_journal = journal;
        ext3_clear_journal_err(sb, es);
-        if (journal_devnum &&
+        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa8..32e6cc23bd9 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 /*
 * ext3_xattr_set_handle()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ea..e0270d1f8d8 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac..dec821168fd 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int, unsigned int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c67..adf96b82278 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metdata, but we do account for it.
         */
-        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+        if (!(*errp) &&
+            ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40..164c56092e5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
        return (ext4_filetype_table[filetype]);
 }
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
-                           struct inode *dir,
+                           struct inode *dir, struct file *filp,
                           struct ext4_dir_entry_2 *de,
                           struct buffer_head *bh,
                           unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
        const int rlen = ext4_rec_len_from_disk(de->rec_len,
                                                dir->i_sb->s_blocksize);
-        if (rlen < EXT4_DIR_REC_LEN(1))
+        if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
                error_msg = "rec_len is smaller than minimal";
-        else if (rlen % 4 != 0)
+        else if (unlikely(rlen % 4 != 0))
                error_msg = "rec_len % 4 != 0";
-        else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+        else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
-        else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+        else if (unlikely(((char *) de - bh->b_data) + rlen >
+                          dir->i_sb->s_blocksize))
                error_msg = "directory entry across blocks";
-        else if (le32_to_cpu(de->inode) >
+        else if (unlikely(le32_to_cpu(de->inode) >
-                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
+        else
+                return 0;
-        if (error_msg != NULL)
+        if (filp)
-                ext4_error_inode(dir, function, line, bh->b_blocknr,
+                ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
-                        "bad entry in directory: %s - "
+                                "bad entry in directory: %s - offset=%u(%u), "
-                        "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+                                "inode=%u, rec_len=%d, name_len=%d",
-                        error_msg, (unsigned) (offset%bh->b_size), offset,
+                                error_msg, (unsigned) (offset%bh->b_size),
-                        le32_to_cpu(de->inode),
+                                offset, le32_to_cpu(de->inode),
-                        rlen, de->name_len);
+                                rlen, de->name_len);
-        return error_msg == NULL ? 1 : 0;
+        else
+                ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+                                "bad entry in directory: %s - offset=%u(%u), "
+                                "inode=%u, rec_len=%d, name_len=%d",
+                                error_msg, (unsigned) (offset%bh->b_size),
+                                offset, le32_to_cpu(de->inode),
+                                rlen, de->name_len);
+        return 1;
 }
 static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
                 */
                if (!bh) {
                        if (!dir_has_error) {
-                                EXT4_ERROR_INODE(inode, "directory "
+                                EXT4_ERROR_FILE(filp, 0,
-                                           "contains a hole at offset %Lu",
+                                                "directory contains a "
+                                                "hole at offset %llu",
                                           (unsigned long long) filp->f_pos);
                                dir_has_error = 1;
                        }
@@ -194,8 +210,8 @@ revalidate:
                while (!error && filp->f_pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-                        if (!ext4_check_dir_entry(inode, de,
+                        if (ext4_check_dir_entry(inode, filp, de,
-                                                  bh, offset)) {
+                                                 bh, offset)) {
                                /*
                                 * On error, skip the f_pos to the next block
                                 */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1f253a9a141..1de65f57203 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)                 \
        ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
-#define EXT4_ERROR_FILE(file, fmt, a...)        \
+#define EXT4_ERROR_FILE(file, block, fmt, a...)                         \
-        ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -561,22 +561,6 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD       FS_IOC32_SETVERSION
 #endif
-/*
- *  Mount options
- */
-struct ext4_mount_options {
-        unsigned long s_mount_opt;
-        uid_t s_resuid;
-        gid_t s_resgid;
-        unsigned long s_commit_interval;
-        u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-        int s_jquota_fmt;
-        char *s_qf_names[MAXQUOTAS];
-#endif
-};
 /* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF
@@ -709,6 +693,8 @@ do {									       \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
                ext4_decode_extra_time(&(inode)->xtime,                        \
                                       raw_inode->xtime ## _extra);            \
+        else                                                                   \
+                (inode)->xtime.tv_nsec = 0;                                    \
 } while (0)
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                        \
@@ -719,6 +705,8 @@ do {									       \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))            \
                ext4_decode_extra_time(&(einode)->xtime,                       \
                                       raw_inode->xtime ## _extra);            \
+        else                                                                   \
+                (einode)->xtime.tv_nsec = 0;                                   \
 } while (0)
 #define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do {									       \
 /*
 * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
 */
 struct ext4_ext_cache {
        ext4_fsblk_t    ec_start;
        ext4_lblk_t     ec_block;
        __u32           ec_len; /* must be 32bit to return holes */
-        __u32           ec_type;
 };
 /*
@@ -774,10 +763,12 @@ struct ext4_inode_info {
         * near to their parent directory's inode.
         */
        ext4_group_t    i_block_group;
+        ext4_lblk_t     i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
        unsigned long   i_state_flags;          /* Dynamic state flags */
+#endif
        unsigned long   i_flags;
-        ext4_lblk_t             i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
        /*
         * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
         */
        struct rw_semaphore i_data_sem;
        struct inode vfs_inode;
-        struct jbd2_inode jinode;
+        struct jbd2_inode *jinode;
        struct ext4_ext_cache i_cached_extent;
        /*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
        unsigned int i_allocated_meta_blocks;
-        unsigned short i_delalloc_reserved_flag;
+        ext4_lblk_t i_da_metadata_calc_last_lblock;
-        sector_t i_da_metadata_calc_last_lblock;
        int i_da_metadata_calc_len;
        /* on-disk additional length */
        __u16 i_extra_isize;
-        spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
@@ -856,9 +845,11 @@ struct ext4_inode_info {
        /* completed IOs that might need unwritten extents handling */
        struct list_head i_completed_io_list;
        spinlock_t i_completed_io_lock;
+        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
-        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
+        spinlock_t i_block_reservation_lock;
        /*
         * Transactions that contain inode's metadata needed to complete
@@ -917,11 +908,20 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE     0x80000000 /* Initialize uninitialized itables */
-#define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)              EXT4_SB(sb)->s_mount_opt &= \
-#define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
+                                                ~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)                EXT4_SB(sb)->s_mount_opt |= \
+                                                EXT4_MOUNT_##opt
 #define test_opt(sb, opt)               (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)
+#define clear_opt2(sb, opt)             EXT4_SB(sb)->s_mount_opt2 &= \
+                                                ~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)               EXT4_SB(sb)->s_mount_opt2 |= \
+                                                EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)              (EXT4_SB(sb)->s_mount_opt2 & \
+                                         EXT4_MOUNT2_##opt)
 #define ext4_set_bit                    ext2_set_bit
 #define ext4_set_bit_atomic             ext2_set_bit_atomic
 #define ext4_clear_bit                  ext2_clear_bit
@@ -1087,6 +1087,7 @@ struct ext4_sb_info {
        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
        struct buffer_head **s_group_desc;
        unsigned int s_mount_opt;
+        unsigned int s_mount_opt2;
        unsigned int s_mount_flags;
        ext4_fsblk_t s_sb_block;
        uid_t s_resuid;
@@ -1237,24 +1238,39 @@ enum {
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
        EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
+        EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
 };
-#define EXT4_INODE_BIT_FNS(name, field)                                 \
+#define EXT4_INODE_BIT_FNS(name, field, offset)                         \
 static inline int ext4_test_inode_##name(struct inode *inode, int bit)  \
 {                                                                       \
-        return test_bit(bit, &EXT4_I(inode)->i_##field);                \
+        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);     \
 }                                                                       \
 static inline void ext4_set_inode_##name(struct inode *inode, int bit)  \
 {                                                                       \
-        set_bit(bit, &EXT4_I(inode)->i_##field);                        \
+        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);             \
 }                                                                       \
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 {                                                                       \
-        clear_bit(bit, &EXT4_I(inode)->i_##field);                      \
+        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);           \
 }
-EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
-EXT4_INODE_BIT_FNS(state, state_flags)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+        (ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+        /* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
+#define ext4_check_dir_entry(dir, filp, de, bh, offset)                 \
-        __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+                                        (de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
@@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
                             ext4_fsblk_t, const char *, ...)
        __attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-                            const char *, ...)
+                            ext4_fsblk_t, const char *, ...)
-        __attribute__ ((format (printf, 4, 5)));
+        __attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd..2e29abb30f7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
 * structure for external API
 */
-#define EXT4_EXT_CACHE_NO       0
-#define EXT4_EXT_CACHE_GAP      1
-#define EXT4_EXT_CACHE_EXTENT   2
 /*
 * to be called by ext4_ext_walk_space()
 * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-        EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+        EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 }
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-                                         sector_t lblocks);
+                                         ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c..d8b992e658c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
        if (ext4_handle_valid(handle))
-                return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+                return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
        return 0;
 }
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 966ecb0d8f8..d202d765dad 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                struct ext4_extent *ex;
                depth = path->p_depth;
-                /* try to predict block placement */
+                /*
+                 * Try to predict block placement assuming that we are
+                 * filling in a file which will eventually be
+                 * non-sparse --- i.e., in the case of libbfd writing
+                 * an ELF object sections out-of-order but in a way
+                 * that eventually results in a contiguous object or
+                 * executable file, or some database extending a table
+                 * space file.  However, this is actually somewhat
+                 * non-ideal if we are writing a sparse file such as
+                 * qemu or KVM writing a raw image file that is going
+                 * to stay fairly sparse, since it will end up
+                 * fragmenting the file system's free space.  Maybe we
+                 * should have some heuristics or some way to allow
+                 * userspace to pass a hint to the file system,
+                 * especially if the latter case turns out to be
+                 * common.
+                 */
                ex = path[depth].p_ext;
-                if (ex)
+                if (ex) {
-                        return (ext4_ext_pblock(ex) +
+                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
-                                (block - le32_to_cpu(ex->ee_block)));
+                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+                        if (block > ext_block)
+                                return ext_pblk + (block - ext_block);
+                        else
+                                return ext_pblk - (ext_block - block);
+                }
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 * to allocate @blocks
 * Worse case is one block per extent
 */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                        cbex.ec_block = start;
                        cbex.ec_len = end - start;
                        cbex.ec_start = 0;
-                        cbex.ec_type = EXT4_EXT_CACHE_GAP;
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
                        cbex.ec_start = ext4_ext_pblock(ex);
-                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
                if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-                        __u32 len, ext4_fsblk_t start, int type)
+                        __u32 len, ext4_fsblk_t start)
 {
        struct ext4_ext_cache *cex;
        BUG_ON(len == 0);
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        cex = &EXT4_I(inode)->i_cached_extent;
-        cex->ec_type = type;
        cex->ec_block = block;
        cex->ec_len = len;
        cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
        }
        ext_debug(" -> %u:%lu\n", lblock, len);
-        ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+        ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                        struct ext4_extent *ex)
 {
        struct ext4_ext_cache *cex;
-        int ret = EXT4_EXT_CACHE_NO;
+        int ret = 0;
        /*
         * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
        cex = &EXT4_I(inode)->i_cached_extent;
        /* has cache valid data? */
-        if (cex->ec_type == EXT4_EXT_CACHE_NO)
+        if (cex->ec_len == 0)
                goto errout;
-        BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
-                        cex->ec_type != EXT4_EXT_CACHE_EXTENT);
        if (in_range(block, cex->ec_block, cex->ec_len)) {
                ex->ee_block = cpu_to_le32(cex->ec_block);
                ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                ext_debug("%u cached by %u:%u:%llu\n",
                                block,
                                cex->ec_block, cex->ec_len, cex->ec_start);
-                ret = cex->ec_type;
+                ret = 1;
        }
 errout:
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 * Handle EOFBLOCKS_FL flag, clearing it if necessary
 */
 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
-                              struct ext4_map_blocks *map,
+                              ext4_lblk_t lblk,
                              struct ext4_ext_path *path,
                              unsigned int len)
 {
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
         * this turns out to be false, we can bail out from this
         * function immediately.
         */
-        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
            ext4_ext_get_actual_len(last_ex))
                return 0;
        /*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                        path);
                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path,
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
-                                                 map->m_len);
+                                                 path, map->m_len);
                } else
                        err = ret;
                goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
        if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
-                err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                         map->m_len);
                if (err < 0)
                        goto out2;
        }
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        struct ext4_extent_header *eh;
        struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-        int err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                  map->m_lblk, map->m_len, inode->i_ino);
        /* check in cache */
-        cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
+        if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
-        if (cache_type) {
+                if (!newex.ee_start_lo && !newex.ee_start_hi) {
-                if (cache_type == EXT4_EXT_CACHE_GAP) {
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
                                 * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                                goto out2;
                        }
                        /* we should allocate requested block */
-                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
+                } else {
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
                        goto out;
-                } else {
-                        BUG();
                }
        }
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* Do not put uninitialized extent in the cache */
                        if (!ext4_ext_is_uninitialized(ex)) {
                                ext4_ext_put_in_cache(inode, ee_block,
-                                                        ee_len, ee_start,
+                                                        ee_len, ee_start);
-                                                        EXT4_EXT_CACHE_EXTENT);
                                goto out;
                        }
                        ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
-        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
        if (err)
                goto out2;
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         * when it is _not_ an uninitialized extent.
         */
        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
+                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
-                                                EXT4_EXT_CACHE_EXTENT);
                ext4_update_inode_fsync_trans(handle, inode, 1);
        } else
                ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
        int err = 0;
        /*
+         * finish any pending end_io work so we won't run the risk of
+         * converting any truncated blocks to initialized later
+         */
+        ext4_flush_completed_IO(inode);
+        /*
         * probably first extent we're gonna free will be last in block
         */
        err = ext4_writepage_trans_blocks(inode);
@@ -3767,7 +3789,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
        logical =  (__u64)newex->ec_block << blksize_bits;
-        if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+        if (newex->ec_start == 0) {
                pgoff_t offset;
                struct page *page;
                struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddcee..bb003dc9fff 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                        ext4_mark_super_dirty(sb);
                }
        }
+        /*
+         * Set up the jbd2_inode if we are opening the inode for
+         * writing and the journal is present
+         */
+        if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+                struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+                spin_lock(&inode->i_lock);
+                if (!ei->jinode) {
+                        if (!jinode) {
+                                spin_unlock(&inode->i_lock);
+                                return -ENOMEM;
+                        }
+                        ei->jinode = jinode;
+                        jbd2_journal_init_jbd_inode(ei->jinode, inode);
+                        jinode = NULL;
+                }
+                spin_unlock(&inode->i_lock);
+                if (unlikely(jinode != NULL))
+                        jbd2_free_inode(jinode);
+        }
        return dquot_file_open(inode, filp);
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf..7829b287822 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
 * to written.
 * The function return the number of pending IOs on success.
 */
-static int flush_completed_IO(struct inode *inode)
+extern int ext4_flush_completed_IO(struct inode *inode)
 {
        ext4_io_end_t *io;
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
        if (inode->i_sb->s_flags & MS_RDONLY)
                return 0;
-        ret = flush_completed_IO(inode);
+        ret = ext4_flush_completed_IO(inode);
        if (ret < 0)
                return ret;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..eb9097aec6f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);
        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ef9d5be0b2a..9f7f9e49914 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -54,10 +56,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
        trace_ext4_begin_ordered_truncate(inode, new_size);
-        return jbd2_journal_begin_ordered_truncate(
+        /*
-                                        EXT4_SB(inode->i_sb)->s_journal,
+         * If jinode is zero, then we never opened the file for
-                                        &EXT4_I(inode)->jinode,
+         * writing, so there's no need to call
-                                        new_size);
+         * jbd2_journal_begin_ordered_truncate() since there are no
+         * outstanding writes we need to flush.
+         */
+        if (!EXT4_I(inode)->jinode)
+                return 0;
+        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   new_size);
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -552,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 }
 /**
- *      ext4_blks_to_allocate: Look up the block map and count the number
+ *      ext4_blks_to_allocate - Look up the block map and count the number
 *      of direct blocks need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
@@ -591,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 /**
 *      ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ *      @handle: handle for this transaction
+ *      @inode: inode which needs allocated blocks
+ *      @iblock: the logical block to start allocated at
+ *      @goal: preferred physical block of allocation
 *      @indirect_blks: the number of blocks need to allocate for indirect
 *                      blocks
- *
+ *      @blks: number of desired blocks
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks(if needed) and the first direct block,
- *      @blks:  on return it will store the total number of allocated
+ *      @err: on return it will store the error code
- *              direct blocks
+ *
+ *      This function will return the number of blocks allocated as
+ *      requested by the passed-in parameters.
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -711,9 +726,11 @@ failed_out:
 /**
 *      ext4_alloc_branch - allocate and set up a chain of blocks.
+ *      @handle: handle for this transaction
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
+ *      @goal: preferred place for allocation
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
@@ -826,6 +843,7 @@ failed:
 /**
 * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
@@ -1081,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 * Calculate the number of metadata blocks need to reserve
 * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1320,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
         * avoid double accounting
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
@@ -1350,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1878,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
 * Reserve a single block located at lblock
 */
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2239,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
         * affects functions in many different parts of the allocation
         * call path.  This flag exists primarily because we don't
         * want to change *many* call functions, so ext4_map_blocks()
-         * will set the magic i_delalloc_reserved_flag once the
+         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
         * inode's allocation semaphore is taken.
         *
         * If the blocks in questions were delalloc blocks, set
@@ -3720,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 retry:
        io_end = ext4_init_io_end(inode, GFP_ATOMIC);
        if (!io_end) {
-                if (printk_ratelimit())
+                pr_warn_ratelimited("%s: allocation fail\n", __func__);
-                        printk(KERN_WARNING "%s: allocation fail\n", __func__);
                schedule();
                goto retry;
        }
@@ -4045,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
        if (ext4_should_journal_data(inode)) {
                err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
-                if (ext4_should_order_data(inode))
+                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
                        err = ext4_jbd2_file_inode(handle, inode);
                mark_buffer_dirty(bh);
        }
@@ -4169,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 {
        __le32 *p;
        int     flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+        int     err;
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4184,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, inode, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(inode->i_sb, err);
+                                return 1;
+                        }
+                }
+                err = ext4_mark_inode_dirty(handle, inode);
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
+                }
+                err = ext4_truncate_restart_trans(handle, inode,
+                                                  blocks_for_truncate(inode));
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
                }
-                ext4_mark_inode_dirty(handle, inode);
-                ext4_truncate_restart_trans(handle, inode,
-                                            blocks_for_truncate(inode));
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext4_journal_get_write_access(handle, bh);
@@ -4349,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                        (__le32 *) bh->b_data,
                                        (__le32 *) bh->b_data + addr_per_block,
                                        depth);
+                        brelse(bh);
                        /*
                         * Everything below this this pointer has been
@@ -4859,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
@@ -5118,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
        if (ext4_inode_blocks_set(handle, raw_inode, ei))
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d5..851f49b2f9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
 static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-        int ret;
        ext4_fsblk_t discard_block;
        discard_block = block + ext4_group_first_block_no(sb, block_group);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
-        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == -EOPNOTSUPP) {
-                ext4_warning(sb, "discard not supported, disabling");
-                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-        }
-        return ret;
 }
 /*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
-        int err, count = 0, count2 = 0;
+        int err, ret, count = 0, count2 = 0;
        struct ext4_free_data *entry;
        struct list_head *l, *ltmp;
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
-                if (test_opt(sb, DISCARD))
+                if (test_opt(sb, DISCARD)) {
-                        ext4_issue_discard(sb, entry->group,
+                        ret = ext4_issue_discard(sb, entry->group,
                                        entry->start_blk, entry->count);
+                        if (unlikely(ret == -EOPNOTSUPP)) {
+                                ext4_warning(sb, "discard not supported, "
+                                                 "disabling");
+                                clear_opt(sb, DISCARD);
+                        }
+                }
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -3881,19 +3881,6 @@ repeat:
        }
 }
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                        struct ext4_buddy *e4b,
-                                        sector_t block, int count)
-{
-        BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
         * EDQUOT check, as blocks and quotas have been already
         * reserved when data being copied into pagecache.
         */
-        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
        else {
                /* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, inquota - ar->len);
        if (!ar->len) {
-                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                if (!ext4_test_inode_state(ar->inode,
+                                           EXT4_STATE_DELALLOC_RESERVED))
                        /* release all the reserved blocks if non delalloc */
                        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
                                                reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
                 * blocks being freed are metadata. these blocks shouldn't
                 * be used until this transaction is committed
                 */
-                new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                if (!new_entry) {
+                        err = -ENOMEM;
+                        goto error_return;
+                }
                new_entry->start_blk = bit;
                new_entry->group  = block_group;
                new_entry->count = count;
@@ -4643,7 +4635,6 @@ do_more:
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
-                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
        }
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
        ext4_unlock_group(sb, group);
        ret = ext4_issue_discard(sb, group, start, count);
-        if (ret)
-                ext4_std_error(sb, ret);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
        ext4_grpblk_t cnt = 0, first_block, last_block;
        uint64_t start, len, minlen, trimmed;
+        ext4_fsblk_t first_data_blk =
+                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        int ret = 0;
        start = range->start >> sb->s_blocksize_bits;
@@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
                return -EINVAL;
+        if (start < first_data_blk) {
+                len -= first_data_blk - start;
+                start = first_data_blk;
+        }
        /* Determine first and last group to examine based on start and len */
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                if (len >= EXT4_BLOCKS_PER_GROUP(sb))
                        len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
                else
-                        last_block = len;
+                        last_block = first_block + len;
                if (e4b.bd_info->bb_free >= minlen) {
                        cnt = ext4_trim_all_free(sb, &e4b, first_block,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b72..b0a126f23c2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
        goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
                EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
        tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-                                   S_IFREG, 0, goal);
+                                   S_IFREG, NULL, goal);
        if (IS_ERR(tmp_inode)) {
                retval = -ENOMEM;
                ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index dc40e75cba8..5485390d32c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-                if (!ext4_check_dir_entry(dir, de, bh,
+                if (ext4_check_dir_entry(dir, NULL, de, bh,
-                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+                                (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-                                                +((char *)de - bh->b_data))) {
+                                         + ((char *)de - bh->b_data))) {
                        /* On error, skip the f_pos to the next block. */
                        dir_file->f_pos = (dir_file->f_pos |
                                        (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
                if ((char *) de + namelen <= dlimit &&
                    ext4_match (namelen, name, de)) {
                        /* found a match - just to be sure, do a full check */
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                return -1;
                        *res_dir = de;
                        return 1;
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                EXT4_ERROR_INODE(dir,
                                                 "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                de = (struct ext4_dir_entry_2 *)bh->b_data;
                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                return -EIO;
                        if (ext4_match(namelen, name, de))
                                return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-                ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                if (err) {
+                        ext4_std_error(inode->i_sb, err);
+                        goto cleanup;
+                }
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
 {
        struct ext4_dir_entry_2 *de, *pde;
        unsigned int blocksize = dir->i_sb->s_blocksize;
-        int i;
+        int i, err;
        i = 0;
        pde = NULL;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        while (i < bh->b_size) {
-                if (!ext4_check_dir_entry(dir, de, bh, i))
+                if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
                        BUFFER_TRACE(bh, "get_write_access");
-                        ext4_journal_get_write_access(handle, bh);
+                        err = ext4_journal_get_write_access(handle, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        if (pde)
                                pde->rec_len = ext4_rec_len_to_disk(
                                        ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
                                de->inode = 0;
                        dir->i_version++;
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, dir, bh);
+                        err = ext4_handle_dirty_metadata(handle, dir, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        return 0;
                }
                i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
        handle_t *handle;
        struct inode *inode;
-        struct buffer_head *dir_block;
+        struct buffer_head *dir_block = NULL;
        struct ext4_dir_entry_2 *de;
        unsigned int blocksize = dir->i_sb->s_blocksize;
        int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
        if (!dir_block)
                goto out_clear_inode;
        BUFFER_TRACE(dir_block, "get_write_access");
-        ext4_journal_get_write_access(handle, dir_block);
+        err = ext4_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-        ext4_handle_dirty_metadata(handle, dir, dir_block);
+        err = ext4_handle_dirty_metadata(handle, dir, dir_block);
-        brelse(dir_block);
+        if (err)
-        ext4_mark_inode_dirty(handle, inode);
+                goto out_clear_inode;
-        err = ext4_add_entry(handle, dentry, inode);
+        err = ext4_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext4_add_entry(handle, dentry, inode);
        if (err) {
 out_clear_inode:
                clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
        }
        ext4_inc_count(handle, dir);
        ext4_update_dx_flag(dir);
-        ext4_mark_inode_dirty(handle, dir);
+        err = ext4_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
                        }
                        de = (struct ext4_dir_entry_2 *) bh->b_data;
                }
-                if (!ext4_check_dir_entry(inode, de, bh, offset)) {
+                if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
                        de = (struct ext4_dir_entry_2 *)(bh->b_data +
                                                         sb->s_blocksize);
                        offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                                        ext4_current_time(new_dir);
                ext4_mark_inode_dirty(handle, new_dir);
                BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-                ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+                retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+                if (unlikely(retval)) {
+                        ext4_std_error(new_dir->i_sb, retval);
+                        goto end_rename;
+                }
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
                                                cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+                retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+                if (retval) {
+                        ext4_std_error(old_dir->i_sb, retval);
+                        goto end_rename;
+                }
                ext4_dec_count(handle, old_dir);
                if (new_inode) {
                        /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index beacce11ac5..7270dcfca92 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -44,7 +44,7 @@ int __init ext4_init_pageio(void)
        if (io_page_cachep == NULL)
                return -ENOMEM;
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
-        if (io_page_cachep == NULL) {
+        if (io_end_cachep == NULL) {
                kmem_cache_destroy(io_page_cachep);
                return -ENOMEM;
        }
@@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work)
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 {
-        ext4_io_end_t *io = NULL;
+        ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
-        io = kmem_cache_alloc(io_end_cachep, flags);
        if (io) {
-                memset(io, 0, sizeof(*io));
                atomic_inc(&EXT4_I(inode)->i_ioend_count);
                io->inode = inode;
                INIT_WORK(&io->work, ext4_end_io_work);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..3ecc6e45d2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-                ext4_handle_dirty_metadata(handle, NULL, gdb);
+                err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+                if (unlikely(err)) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -232,6 +236,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                               GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
@@ -247,13 +253,20 @@ static int setup_new_group_blocks(struct super_block *sb,
        err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = input->inode_table - start;
+             i < sbi->s_itb_per_group; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
        ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
                             bh->b_data);
-        ext4_handle_dirty_metadata(handle, NULL, bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bh);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_bh;
+        }
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
        ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -265,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                             bh->b_data);
-        ext4_handle_dirty_metadata(handle, NULL, bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bh);
+        if (unlikely(err))
+                ext4_std_error(sb, err);
 exit_bh:
        brelse(bh);
@@ -417,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
                goto exit_dind;
        }
-        if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+        err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+        if (unlikely(err))
                goto exit_dind;
-        if ((err = ext4_journal_get_write_access(handle, *primary)))
+        err = ext4_journal_get_write_access(handle, *primary);
+        if (unlikely(err))
                goto exit_sbh;
-        if ((err = ext4_journal_get_write_access(handle, dind)))
+        err = ext4_journal_get_write_access(handle, dind);
-                goto exit_primary;
+        if (unlikely(err))
+                ext4_std_error(sb, err);
        /* ext4_reserve_inode_write() gets a reference on the iloc */
-        if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+        err = ext4_reserve_inode_write(handle, inode, &iloc);
+        if (unlikely(err))
                goto exit_dindj;
        n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -449,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-        ext4_handle_dirty_metadata(handle, NULL, dind);
+        err = ext4_handle_dirty_metadata(handle, NULL, dind);
-        brelse(dind);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_inode;
+        }
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
        ext4_mark_iloc_dirty(handle, inode, &iloc);
        memset((*primary)->b_data, 0, sb->s_blocksize);
-        ext4_handle_dirty_metadata(handle, NULL, *primary);
+        err = ext4_handle_dirty_metadata(handle, NULL, *primary);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_inode;
+        }
+        brelse(dind);
        o_group_desc = EXT4_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -465,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+        err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+        if (err)
+                ext4_std_error(sb, err);
-        return 0;
+        return err;
 exit_inode:
        /* ext4_journal_release_buffer(handle, iloc.bh); */
        brelse(iloc.bh);
 exit_dindj:
        /* ext4_journal_release_buffer(handle, dind); */
-exit_primary:
-        /* ext4_journal_release_buffer(handle, *primary); */
 exit_sbh:
-        /* ext4_journal_release_buffer(handle, *primary); */
+        /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
 exit_dind:
        brelse(dind);
 exit_bh:
@@ -660,7 +687,9 @@ static void update_backups(struct super_block *sb,
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                ext4_handle_dirty_metadata(handle, NULL, bh);
+                err = ext4_handle_dirty_metadata(handle, NULL, bh);
+                if (unlikely(err))
+                        ext4_std_error(sb, err);
                brelse(bh);
        }
        if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -878,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        /* Update the global fs size fields */
        sbi->s_groups_count++;
-        ext4_handle_dirty_metadata(handle, NULL, primary);
+        err = ext4_handle_dirty_metadata(handle, NULL, primary);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_journal;
+        }
        /* Update the reserved block counts only once the new group is
         * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb15c9c0be7..29c80f6d8b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
 void __ext4_error(struct super_block *sb, const char *function,
                  unsigned int line, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
+        vaf.fmt = fmt;
-               sb->s_id, function, line, current->comm);
+        vaf.va = &args;
-        vprintk(fmt, args);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
-        printk("\n");
+               sb->s_id, function, line, current->comm, &vaf);
        va_end(args);
        ext4_handle_error(sb);
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
                      const char *fmt, ...)
 {
        va_list args;
+        struct va_format vaf;
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
        es->s_last_error_block = cpu_to_le64(block);
        save_error_info(inode->i_sb, function, line);
        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
               inode->i_sb->s_id, function, line, inode->i_ino);
        if (block)
-                printk("block %llu: ", block);
+                printk(KERN_CONT "block %llu: ", block);
-        printk("comm %s: ", current->comm);
+        printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
-        vprintk(fmt, args);
-        printk("\n");
        va_end(args);
        ext4_handle_error(inode->i_sb);
 }
 void ext4_error_file(struct file *file, const char *function,
-                     unsigned int line, const char *fmt, ...)
+                     unsigned int line, ext4_fsblk_t block,
+                     const char *fmt, ...)
 {
        va_list args;
+        struct va_format vaf;
        struct ext4_super_block *es;
        struct inode *inode = file->f_dentry->d_inode;
        char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
        es = EXT4_SB(inode->i_sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
        save_error_info(inode->i_sb, function, line);
-        va_start(args, fmt);
        path = d_path(&(file->f_path), pathname, sizeof(pathname));
-        if (!path)
+        if (IS_ERR(path))
                path = "(unknown)";
        printk(KERN_CRIT
-               "EXT4-fs error (device %s): %s:%d: inode #%lu "
+               "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
-               "(comm %s path %s): ",
+               inode->i_sb->s_id, function, line, inode->i_ino);
-               inode->i_sb->s_id, function, line, inode->i_ino,
+        if (block)
-               current->comm, path);
+                printk(KERN_CONT "block %llu: ", block);
-        vprintk(fmt, args);
+        va_start(args, fmt);
-        printk("\n");
+        vaf.fmt = fmt;
+        vaf.va = &args;
+        printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
        va_end(args);
        ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
                panic("EXT4-fs panic from previous error\n");
 }
-void ext4_msg (struct super_block * sb, const char *prefix,
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-                   const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
 }
 void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
+        vaf.fmt = fmt;
-               sb->s_id, function, line);
+        vaf.va = &args;
-        vprintk(fmt, args);
+        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
-        printk("\n");
+               sb->s_id, function, line, &vaf);
        va_end(args);
 }
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
 __releases(bitlock)
 __acquires(bitlock)
 {
+        struct va_format vaf;
        va_list args;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(ino);
        es->s_last_error_block = cpu_to_le64(block);
        __save_error_info(sb, function, line);
        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
               sb->s_id, function, line, grp);
        if (ino)
-                printk("inode %lu: ", ino);
+                printk(KERN_CONT "inode %lu: ", ino);
        if (block)
-                printk("block %llu:", (unsigned long long) block);
+                printk(KERN_CONT "block %llu:", (unsigned long long) block);
-        vprintk(fmt, args);
+        printk(KERN_CONT "%pV\n", &vaf);
-        printk("\n");
        va_end(args);
        if (test_opt(sb, ERRORS_CONT)) {
@@ -808,21 +818,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
-        /*
-         * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
-         * therefore it can be null here.  Don't check it, just initialize
-         * jinode.
-         */
-        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
        ei->i_reserved_data_blocks = 0;
        ei->i_reserved_meta_blocks = 0;
        ei->i_allocated_meta_blocks = 0;
        ei->i_da_metadata_calc_len = 0;
-        ei->i_delalloc_reserved_flag = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
 #endif
+        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_completed_io_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->cur_aio_dio = NULL;
@@ -841,6 +845,13 @@ static int ext4_drop_inode(struct inode *inode)
        return drop;
 }
+static void ext4_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
 static void ext4_destroy_inode(struct inode *inode)
 {
        ext4_ioend_wait(inode);
@@ -853,7 +864,7 @@ static void ext4_destroy_inode(struct inode *inode)
                                true);
                dump_stack();
        }
-        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+        call_rcu(&inode->i_rcu, ext4_i_callback);
 }
 static void init_once(void *foo)
@@ -891,9 +902,12 @@ void ext4_clear_inode(struct inode *inode)
        end_writeback(inode);
        dquot_drop(inode);
        ext4_discard_preallocations(inode);
-        if (EXT4_JOURNAL(inode))
+        if (EXT4_I(inode)->jinode) {
-                jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
-                                       &EXT4_I(inode)->jinode);
+                                               EXT4_I(inode)->jinode);
+                jbd2_free_inode(EXT4_I(inode)->jinode);
+                EXT4_I(inode)->jinode = NULL;
+        }
 }
 static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -1386,7 +1400,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
                sbi->s_qf_names[qtype] = NULL;
                return 0;
        }
-        set_opt(sbi->s_mount_opt, QUOTA);
+        set_opt(sb, QUOTA);
        return 1;
 }
@@ -1441,21 +1455,21 @@ static int parse_options(char *options, struct super_block *sb,
                switch (token) {
                case Opt_bsd_df:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        clear_opt(sbi->s_mount_opt, MINIX_DF);
+                        clear_opt(sb, MINIX_DF);
                        break;
                case Opt_minix_df:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        set_opt(sbi->s_mount_opt, MINIX_DF);
+                        set_opt(sb, MINIX_DF);
                        break;
                case Opt_grpid:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        set_opt(sbi->s_mount_opt, GRPID);
+                        set_opt(sb, GRPID);
                        break;
                case Opt_nogrpid:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        clear_opt(sbi->s_mount_opt, GRPID);
+                        clear_opt(sb, GRPID);
                        break;
                case Opt_resuid:
@@ -1473,38 +1487,38 @@ static int parse_options(char *options, struct super_block *sb,
                        /* *sb_block = match_int(&args[0]); */
                        break;
                case Opt_err_panic:
-                        clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        clear_opt(sb, ERRORS_CONT);
-                        clear_opt(sbi->s_mount_opt, ERRORS_RO);
+                        clear_opt(sb, ERRORS_RO);
-                        set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        set_opt(sb, ERRORS_PANIC);
                        break;
                case Opt_err_ro:
-                        clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        clear_opt(sb, ERRORS_CONT);
-                        clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        clear_opt(sb, ERRORS_PANIC);
-                        set_opt(sbi->s_mount_opt, ERRORS_RO);
+                        set_opt(sb, ERRORS_RO);
                        break;
                case Opt_err_cont:
-                        clear_opt(sbi->s_mount_opt, ERRORS_RO);
+                        clear_opt(sb, ERRORS_RO);
-                        clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        clear_opt(sb, ERRORS_PANIC);
-                        set_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        set_opt(sb, ERRORS_CONT);
                        break;
                case Opt_nouid32:
-                        set_opt(sbi->s_mount_opt, NO_UID32);
+                        set_opt(sb, NO_UID32);
                        break;
                case Opt_debug:
-                        set_opt(sbi->s_mount_opt, DEBUG);
+                        set_opt(sb, DEBUG);
                        break;
                case Opt_oldalloc:
-                        set_opt(sbi->s_mount_opt, OLDALLOC);
+                        set_opt(sb, OLDALLOC);
                        break;
                case Opt_orlov:
-                        clear_opt(sbi->s_mount_opt, OLDALLOC);
+                        clear_opt(sb, OLDALLOC);
                        break;
 #ifdef CONFIG_EXT4_FS_XATTR
                case Opt_user_xattr:
-                        set_opt(sbi->s_mount_opt, XATTR_USER);
+                        set_opt(sb, XATTR_USER);
                        break;
                case Opt_nouser_xattr:
-                        clear_opt(sbi->s_mount_opt, XATTR_USER);
+                        clear_opt(sb, XATTR_USER);
                        break;
 #else
                case Opt_user_xattr:
@@ -1514,10 +1528,10 @@ static int parse_options(char *options, struct super_block *sb,
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
                case Opt_acl:
-                        set_opt(sbi->s_mount_opt, POSIX_ACL);
+                        set_opt(sb, POSIX_ACL);
                        break;
                case Opt_noacl:
-                        clear_opt(sbi->s_mount_opt, POSIX_ACL);
+                        clear_opt(sb, POSIX_ACL);
                        break;
 #else
                case Opt_acl:
@@ -1536,7 +1550,7 @@ static int parse_options(char *options, struct super_block *sb,
                                         "Cannot specify journal on remount");
                                return 0;
                        }
-                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
+                        set_opt(sb, UPDATE_JOURNAL);
                        break;
                case Opt_journal_dev:
                        if (is_remount) {
@@ -1549,14 +1563,14 @@ static int parse_options(char *options, struct super_block *sb,
                        *journal_devnum = option;
                        break;
                case Opt_journal_checksum:
-                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        set_opt(sb, JOURNAL_CHECKSUM);
                        break;
                case Opt_journal_async_commit:
-                        set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+                        set_opt(sb, JOURNAL_ASYNC_COMMIT);
-                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        set_opt(sb, JOURNAL_CHECKSUM);
                        break;
                case Opt_noload:
-                        set_opt(sbi->s_mount_opt, NOLOAD);
+                        set_opt(sb, NOLOAD);
                        break;
                case Opt_commit:
                        if (match_int(&args[0], &option))
@@ -1599,15 +1613,15 @@ static int parse_options(char *options, struct super_block *sb,
                                        return 0;
                                }
                        } else {
-                                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+                                clear_opt(sb, DATA_FLAGS);
                                sbi->s_mount_opt |= data_opt;
                        }
                        break;
                case Opt_data_err_abort:
-                        set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                        set_opt(sb, DATA_ERR_ABORT);
                        break;
                case Opt_data_err_ignore:
-                        clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                        clear_opt(sb, DATA_ERR_ABORT);
                        break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
@@ -1647,12 +1661,12 @@ set_qf_format:
                        break;
                case Opt_quota:
                case Opt_usrquota:
-                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sb, QUOTA);
-                        set_opt(sbi->s_mount_opt, USRQUOTA);
+                        set_opt(sb, USRQUOTA);
                        break;
                case Opt_grpquota:
-                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sb, QUOTA);
-                        set_opt(sbi->s_mount_opt, GRPQUOTA);
+                        set_opt(sb, GRPQUOTA);
                        break;
                case Opt_noquota:
                        if (sb_any_quota_loaded(sb)) {
@@ -1660,9 +1674,9 @@ set_qf_format:
                                        "options when quota turned on");
                                return 0;
                        }
-                        clear_opt(sbi->s_mount_opt, QUOTA);
+                        clear_opt(sb, QUOTA);
-                        clear_opt(sbi->s_mount_opt, USRQUOTA);
+                        clear_opt(sb, USRQUOTA);
-                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
+                        clear_opt(sb, GRPQUOTA);
                        break;
 #else
                case Opt_quota:
@@ -1688,7 +1702,7 @@ set_qf_format:
                        sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
                        break;
                case Opt_nobarrier:
-                        clear_opt(sbi->s_mount_opt, BARRIER);
+                        clear_opt(sb, BARRIER);
                        break;
                case Opt_barrier:
                        if (args[0].from) {
@@ -1697,9 +1711,9 @@ set_qf_format:
                        } else
                                option = 1;     /* No argument, default to 1 */
                        if (option)
-                                set_opt(sbi->s_mount_opt, BARRIER);
+                                set_opt(sb, BARRIER);
                        else
-                                clear_opt(sbi->s_mount_opt, BARRIER);
+                                clear_opt(sb, BARRIER);
                        break;
                case Opt_ignore:
                        break;
@@ -1723,17 +1737,17 @@ set_qf_format:
                                 "Ignoring deprecated bh option");
                        break;
                case Opt_i_version:
-                        set_opt(sbi->s_mount_opt, I_VERSION);
+                        set_opt(sb, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
                        break;
                case Opt_nodelalloc:
-                        clear_opt(sbi->s_mount_opt, DELALLOC);
+                        clear_opt(sb, DELALLOC);
                        break;
                case Opt_mblk_io_submit:
-                        set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                        set_opt(sb, MBLK_IO_SUBMIT);
                        break;
                case Opt_nomblk_io_submit:
-                        clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                        clear_opt(sb, MBLK_IO_SUBMIT);
                        break;
                case Opt_stripe:
                        if (match_int(&args[0], &option))
@@ -1743,13 +1757,13 @@ set_qf_format:
                        sbi->s_stripe = option;
                        break;
                case Opt_delalloc:
-                        set_opt(sbi->s_mount_opt, DELALLOC);
+                        set_opt(sb, DELALLOC);
                        break;
                case Opt_block_validity:
-                        set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        set_opt(sb, BLOCK_VALIDITY);
                        break;
                case Opt_noblock_validity:
-                        clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        clear_opt(sb, BLOCK_VALIDITY);
                        break;
                case Opt_inode_readahead_blks:
                        if (match_int(&args[0], &option))
@@ -1773,7 +1787,7 @@ set_qf_format:
                                                            option);
                        break;
                case Opt_noauto_da_alloc:
-                        set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                        set_opt(sb, NO_AUTO_DA_ALLOC);
                        break;
                case Opt_auto_da_alloc:
                        if (args[0].from) {
@@ -1782,24 +1796,24 @@ set_qf_format:
                        } else
                                option = 1;     /* No argument, default to 1 */
                        if (option)
-                                clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                                clear_opt(sb, NO_AUTO_DA_ALLOC);
                        else
-                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                                set_opt(sb,NO_AUTO_DA_ALLOC);
                        break;
                case Opt_discard:
-                        set_opt(sbi->s_mount_opt, DISCARD);
+                        set_opt(sb, DISCARD);
                        break;
                case Opt_nodiscard:
-                        clear_opt(sbi->s_mount_opt, DISCARD);
+                        clear_opt(sb, DISCARD);
                        break;
                case Opt_dioread_nolock:
-                        set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        set_opt(sb, DIOREAD_NOLOCK);
                        break;
                case Opt_dioread_lock:
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                        break;
                case Opt_init_inode_table:
-                        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        set_opt(sb, INIT_INODE_TABLE);
                        if (args[0].from) {
                                if (match_int(&args[0], &option))
                                        return 0;
@@ -1810,7 +1824,7 @@ set_qf_format:
                        sbi->s_li_wait_mult = option;
                        break;
                case Opt_noinit_inode_table:
-                        clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        clear_opt(sb, INIT_INODE_TABLE);
                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
@@ -1822,10 +1836,10 @@ set_qf_format:
 #ifdef CONFIG_QUOTA
        if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
                if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
-                        clear_opt(sbi->s_mount_opt, USRQUOTA);
+                        clear_opt(sb, USRQUOTA);
                if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
-                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
+                        clear_opt(sb, GRPQUOTA);
                if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
                        ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1895,12 +1909,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        ext4_commit_super(sb, 1);
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-                                "bpg=%lu, ipg=%lu, mo=%04x]\n",
+                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
                        EXT4_INODES_PER_GROUP(sb),
-                        sbi->s_mount_opt);
+                        sbi->s_mount_opt, sbi->s_mount_opt2);
        return res;
 }
@@ -1930,14 +1944,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
        size = flex_group_count * sizeof(struct flex_groups);
        sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
        if (sbi->s_flex_groups == NULL) {
-                sbi->s_flex_groups = vmalloc(size);
+                sbi->s_flex_groups = vzalloc(size);
-                if (sbi->s_flex_groups)
+                if (sbi->s_flex_groups == NULL) {
-                        memset(sbi->s_flex_groups, 0, size);
+                        ext4_msg(sb, KERN_ERR,
-        }
+                                 "not enough memory for %u flex groups",
-        if (sbi->s_flex_groups == NULL) {
+                                 flex_group_count);
-                ext4_msg(sb, KERN_ERR, "not enough memory for "
+                        goto failed;
-                                "%u flex groups", flex_group_count);
+                }
-                goto failed;
        }
        for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2916,7 +2929,7 @@ static int ext4_register_li_request(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_li_request *elr;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
-        int ret;
+        int ret = 0;
        if (sbi->s_li_request != NULL)
                return 0;
@@ -3071,41 +3084,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+        set_opt(sb, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
-                set_opt(sbi->s_mount_opt, DEBUG);
+                set_opt(sb, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
                ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
                        "2.6.38");
-                set_opt(sbi->s_mount_opt, GRPID);
+                set_opt(sb, GRPID);
        }
        if (def_mount_opts & EXT4_DEFM_UID16)
-                set_opt(sbi->s_mount_opt, NO_UID32);
+                set_opt(sb, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
        if (def_mount_opts & EXT4_DEFM_XATTR_USER)
-                set_opt(sbi->s_mount_opt, XATTR_USER);
+                set_opt(sb, XATTR_USER);
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
        if (def_mount_opts & EXT4_DEFM_ACL)
-                set_opt(sbi->s_mount_opt, POSIX_ACL);
+                set_opt(sb, POSIX_ACL);
 #endif
        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-                set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+                set_opt(sb, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-                set_opt(sbi->s_mount_opt, ORDERED_DATA);
+                set_opt(sb, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+                set_opt(sb, WRITEBACK_DATA);
        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-                set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                set_opt(sb, ERRORS_PANIC);
        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
-                set_opt(sbi->s_mount_opt, ERRORS_CONT);
+                set_opt(sb, ERRORS_CONT);
        else
-                set_opt(sbi->s_mount_opt, ERRORS_RO);
+                set_opt(sb, ERRORS_RO);
        if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
-                set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                set_opt(sb, BLOCK_VALIDITY);
        if (def_mount_opts & EXT4_DEFM_DISCARD)
-                set_opt(sbi->s_mount_opt, DISCARD);
+                set_opt(sb, DISCARD);
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3114,7 +3127,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
-                set_opt(sbi->s_mount_opt, BARRIER);
+                set_opt(sb, BARRIER);
        /*
         * enable delayed allocation by default
@@ -3122,7 +3135,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        if (!IS_EXT3_SB(sb) &&
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
-                set_opt(sbi->s_mount_opt, DELALLOC);
+                set_opt(sb, DELALLOC);
        if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
                           &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3425,8 +3438,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       "suppressed and not mounted read-only");
                goto failed_mount_wq;
        } else {
-                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+                clear_opt(sb, DATA_FLAGS);
-                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+                set_opt(sb, WRITEBACK_DATA);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
@@ -3464,9 +3477,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 */
                if (jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
-                        set_opt(sbi->s_mount_opt, ORDERED_DATA);
+                        set_opt(sb, ORDERED_DATA);
                else
-                        set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+                        set_opt(sb, JOURNAL_DATA);
                break;
        case EXT4_MOUNT_ORDERED_DATA:
@@ -3556,18 +3569,18 @@ no_journal:
            (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
                ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
                         "requested data journaling mode");
-                clear_opt(sbi->s_mount_opt, DELALLOC);
+                clear_opt(sb, DELALLOC);
        }
        if (test_opt(sb, DIOREAD_NOLOCK)) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
                                "option - requested data journaling mode");
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                }
                if (sb->s_blocksize < PAGE_SIZE) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
                                "option - block size is too small");
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                }
        }
@@ -4166,6 +4179,22 @@ static int ext4_unfreeze(struct super_block *sb)
        return 0;
 }
+/*
+ * Structure to save mount options for ext4_remount's benefit
+ */
+struct ext4_mount_options {
+        unsigned long s_mount_opt;
+        unsigned long s_mount_opt2;
+        uid_t s_resuid;
+        gid_t s_resgid;
+        unsigned long s_commit_interval;
+        u32 s_min_batch_time, s_max_batch_time;
+#ifdef CONFIG_QUOTA
+        int s_jquota_fmt;
+        char *s_qf_names[MAXQUOTAS];
+#endif
+};
 static int ext4_remount(struct super_block *sb, int *flags, char *data)
 {
        struct ext4_super_block *es;
@@ -4186,6 +4215,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        lock_super(sb);
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
+        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4339,6 +4369,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
+        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b..fc32176eee3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
 static int
 ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-        int i_error, b_error;
+        int ret, ret2;
        down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
-        i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+        ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
-        if (i_error < 0) {
+        if (ret < 0)
-                b_error = 0;
+                goto errout;
-        } else {
+        if (buffer) {
-                if (buffer) {
+                buffer += ret;
-                        buffer += i_error;
+                buffer_size -= ret;
-                        buffer_size -= i_error;
-                }
-                b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
-                if (b_error < 0)
-                        i_error = 0;
        }
+        ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
+        if (ret < 0)
+                goto errout;
+        ret += ret2;
+errout:
        up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
-        return i_error + b_error;
+        return ret;
 }
 /*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 /*
 * ext4_xattr_set_handle()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ad6998a92c3..206351af7c5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void fat_destroy_inode(struct inode *inode)
+static void fat_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
+static void fat_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, fat_i_callback);
+}
 static void init_once(void *foo)
 {
        struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -743,7 +750,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
         */
        result = d_obtain_alias(inode);
        if (!IS_ERR(result))
-                result->d_op = sb->s_root->d_op;
+                d_set_d_op(result, sb->s_root->d_op);
        return result;
 }
@@ -793,7 +800,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
        parent = d_obtain_alias(inode);
        if (!IS_ERR(parent))
-                parent->d_op = sb->s_root->d_op;
+                d_set_d_op(parent, sb->s_root->d_op);
 out:
        unlock_super(sb);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3345aabd1dd..35ffe43afa4 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
 * that the existing dentry can be used. The msdos fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
+               struct qstr *qstr)
 {
        struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
        unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
 * Compare two msdos names. If either of the names are invalid,
 * we fall back to doing the standard name comparison.
 */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+        struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
        unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
        int error;
-        error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+        error = msdos_format_name(name->name, name->len, a_msdos_name, options);
        if (error)
                goto old_compare;
-        error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+        error = msdos_format_name(str, len, b_msdos_name, options);
        if (error)
                goto old_compare;
        error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
 old_compare:
        error = 1;
-        if (a->len == b->len)
+        if (name->len == len)
-                error = memcmp(a->name, b->name, a->len);
+                error = memcmp(name->name, str, len);
        goto out;
 }
@@ -224,10 +227,10 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
        }
 out:
        unlock_super(sb);
-        dentry->d_op = &msdos_dentry_operations;
+        d_set_d_op(dentry, &msdos_dentry_operations);
        dentry = d_splice_alias(inode, dentry);
        if (dentry)
-                dentry->d_op = &msdos_dentry_operations;
+                d_set_d_op(dentry, &msdos_dentry_operations);
        return dentry;
 error:
@@ -670,7 +673,7 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
        }
        sb->s_flags |= MS_NOATIME;
-        sb->s_root->d_op = &msdos_dentry_operations;
+        d_set_d_op(sb->s_root, &msdos_dentry_operations);
        unlock_super(sb);
        return 0;
 }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b936703b892..e3ffc5e1233 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        /* This is not negative dentry. Always valid. */
        if (dentry->d_inode)
                return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        /*
         * This is not negative dentry. Always valid.
         *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 }
 /* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
+static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
 {
-        unsigned int len = qstr->len;
+        while (len && name[len - 1] == '.')
-        while (len && qstr->name[len - 1] == '.')
                len--;
        return len;
 }
+static unsigned int vfat_striptail_len(const struct qstr *qstr)
+{       /*
+         * qstr convenience wrapper: passes the qstr's len and name on to
+         * __vfat_striptail_len() and returns its result unchanged.
+         */
+        return __vfat_striptail_len(qstr->len, qstr->name);
+}
 /*
 * Compute the hash for the vfat name corresponding to the dentry.
 * Note: if the name is invalid, we leave the hash code unchanged so
 * that the existing dentry can be used. The vfat fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
        return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
 * that the existing dentry can be used. The vfat fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+        struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
        const unsigned char *name;
        unsigned int len;
        unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
 /*
 * Case insensitive compare of two vfat names.
 */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+        struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
        unsigned int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = vfat_striptail_len(a);
+        alen = vfat_striptail_len(name);
-        blen = vfat_striptail_len(b);
+        blen = __vfat_striptail_len(len, str);
        if (alen == blen) {
-                if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+                if (nls_strnicmp(t, name->name, str, alen) == 0)
                        return 0;
        }
        return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
 /*
 * Case sensitive compare of two vfat names.
 */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        unsigned int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = vfat_striptail_len(a);
+        alen = vfat_striptail_len(name);
-        blen = vfat_striptail_len(b);
+        blen = __vfat_striptail_len(len, str);
        if (alen == blen) {
-                if (strncmp(a->name, b->name, alen) == 0)
+                if (strncmp(name->name, str, alen) == 0)
                        return 0;
        }
        return 1;
@@ -757,11 +772,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 out:
        unlock_super(sb);
-        dentry->d_op = sb->s_root->d_op;
+        d_set_d_op(dentry, sb->s_root->d_op);
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
        if (dentry) {
-                dentry->d_op = sb->s_root->d_op;
+                d_set_d_op(dentry, sb->s_root->d_op);
                dentry->d_time = dentry->d_parent->d_inode->i_version;
        }
        return dentry;
@@ -1063,9 +1078,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (MSDOS_SB(sb)->options.name_check != 's')
-                sb->s_root->d_op = &vfat_ci_dentry_ops;
+                d_set_d_op(sb->s_root, &vfat_ci_dentry_ops);
        else
-                sb->s_root->d_op = &vfat_dentry_ops;
+                d_set_d_op(sb->s_root, &vfat_dentry_ops);
        unlock_super(sb);
        return 0;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8ee..751d6b255a1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
                tmp = &(*tmp)->next;
        }
        write_unlock(&file_systems_lock);
+        synchronize_rcu();
        return -EINVAL;
 }
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 8c04eac5079..2ba6719ac61 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
        return ip;
 }
+static void vxfs_i_callback(struct rcu_head *head)
+{       /*
+         * RCU callback; vxfs_evict_inode() queues it with call_rcu() on the
+         * inode's i_rcu head.
+         */
+        struct inode *inode = container_of(head, struct inode, i_rcu);  /* inode that embeds @head */
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(vxfs_inode_cachep, inode->i_private);  /* releases what i_private points to, not @inode itself */
+}
 /**
 * vxfs_evict_inode - remove inode from main memory
 * @ip:         inode to discard.
@@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
 {
        truncate_inode_pages(&ip->i_data, 0);
        end_writeback(ip);
-        kmem_cache_free(vxfs_inode_cachep, ip->i_private);
+        call_rcu(&ip->i_rcu, vxfs_i_callback);
 }
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3..68ca487bedb 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -14,12 +14,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
        struct path old_root;
        spin_lock(&fs->lock);
+        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
-        path_get(path);
+        path_get_long(path);
+        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_root.dentry)
-                path_put(&old_root);
+                path_put_long(&old_root);
 }
 /*
@@ -31,13 +33,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
        struct path old_pwd;
        spin_lock(&fs->lock);
+        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
-        path_get(path);
+        path_get_long(path);
+        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_pwd.dentry)
-                path_put(&old_pwd);
+                path_put_long(&old_pwd);
 }
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +56,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                fs = p->fs;
                if (fs) {
                        spin_lock(&fs->lock);
+                        write_seqcount_begin(&fs->seq);
                        if (fs->root.dentry == old_root->dentry
                            && fs->root.mnt == old_root->mnt) {
-                                path_get(new_root);
+                                path_get_long(new_root);
                                fs->root = *new_root;
                                count++;
                        }
                        if (fs->pwd.dentry == old_root->dentry
                            && fs->pwd.mnt == old_root->mnt) {
-                                path_get(new_root);
+                                path_get_long(new_root);
                                fs->pwd = *new_root;
                                count++;
                        }
+                        write_seqcount_end(&fs->seq);
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
        while (count--)
-                path_put(old_root);
+                path_put_long(old_root);
 }
 void free_fs_struct(struct fs_struct *fs)
 {
-        path_put(&fs->root);
+        path_put_long(&fs->root);
-        path_put(&fs->pwd);
+        path_put_long(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
 }
@@ -88,8 +94,10 @@ void exit_fs(struct task_struct *tsk)
                int kill;
                task_lock(tsk);
                spin_lock(&fs->lock);
+                write_seqcount_begin(&fs->seq);
                tsk->fs = NULL;
                kill = !--fs->users;
+                write_seqcount_end(&fs->seq);
                spin_unlock(&fs->lock);
                task_unlock(tsk);
                if (kill)
@@ -105,8 +113,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
                fs->users = 1;
                fs->in_exec = 0;
                spin_lock_init(&fs->lock);
+                seqcount_init(&fs->seq);
                fs->umask = old->umask;
-                get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
+                spin_lock(&old->lock);
+                fs->root = old->root;
+                path_get_long(&fs->root);
+                fs->pwd = old->pwd;
+                path_get_long(&fs->pwd);
+                spin_unlock(&old->lock);
        }
        return fs;
 }
@@ -144,6 +159,7 @@ EXPORT_SYMBOL(current_umask);
 struct fs_struct init_fs = {
        .users          = 1,
        .lock           = __SPIN_LOCK_UNLOCKED(init_fs.lock),
+        .seq            = SEQCNT_ZERO,
        .umask          = 0022,
 };
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308d..cf8d28d1fba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+                       u64 nodeid, u64 nlookup)
+{       /*
+         * Record @nodeid/@nlookup in @forget, append it to the tail of fc's
+         * forget list and notify waiters on fc->waitq and fc->fasync.
+         *
+         * The link stays on the list after this returns; the dequeue paths
+         * in this file kfree() the links they remove.
+         *
+         * NOTE(review): forget->next is not written here, so it is presumably
+         * expected to be NULL already -- confirm in fuse_alloc_forget().
+         */
+        forget->forget_one.nodeid = nodeid;
+        forget->forget_one.nlookup = nlookup;
+        spin_lock(&fc->lock);  /* list append and wakeups happen under fc->lock */
+        fc->forget_list_tail->next = forget;
+        fc->forget_list_tail = forget;
+        wake_up(&fc->waitq);
+        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+        spin_unlock(&fc->lock);
+}
 static void flush_bg_queue(struct fuse_conn *fc)
 {
        while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
        }
 }
-void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
-{
-        req->isreply = 0;
-        fuse_request_send_nowait(fc, req);
-}
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
        return err;
 }
+static int forget_pending(struct fuse_conn *fc)
+{       /* Nonzero when fc's forget list holds at least one link (head.next is non-NULL). */
+        return fc->forget_list_head.next != NULL;
+}
 static int request_pending(struct fuse_conn *fc)
 {
-        return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+        return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
+                forget_pending(fc);
 }
 /* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
        return err ? err : reqsize;
 }
+static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+                                               unsigned max,
+                                               unsigned *countp)
+{       /*
+         * Detach up to @max links from the front of fc's forget list.
+         *
+         * Returns the detached links as a NULL-terminated chain (NULL when
+         * nothing was detached).  If @countp is non-NULL it receives the
+         * number of links detached.  The read-path callers in this file call
+         * this before they drop fc->lock.
+         */
+        struct fuse_forget_link *head = fc->forget_list_head.next;
+        struct fuse_forget_link **newhead = &head;  /* pointer slot at which the chain will be cut */
+        unsigned count;
+        for (count = 0; *newhead != NULL && count < max; count++)
+                newhead = &(*newhead)->next;
+        fc->forget_list_head.next = *newhead;  /* whatever was not taken stays queued */
+        *newhead = NULL;  /* terminate the detached chain */
+        if (fc->forget_list_head.next == NULL)
+                fc->forget_list_tail = &fc->forget_list_head;  /* list is now empty: tail points back at the head */
+        if (countp != NULL)
+                *countp = count;
+        return head;
+}
+static int fuse_read_single_forget(struct fuse_conn *fc,
+                                   struct fuse_copy_state *cs,
+                                   size_t nbytes)
+__releases(fc->lock)
+{       /*
+         * Take one link off the forget list and copy it out through @cs as a
+         * FUSE_FORGET message (fuse_in_header followed by fuse_forget_in).
+         *
+         * Entered with fc->lock held; the lock is dropped before copying.
+         * Returns the message length on success, -EINVAL if @nbytes is
+         * smaller than the message, or the error from fuse_copy_one().
+         *
+         * NOTE(review): the link is dequeued and freed before the @nbytes
+         * check, so a too-small buffer discards that forget.
+         */
+        int err;
+        struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+        struct fuse_forget_in arg = {
+                .nlookup = forget->forget_one.nlookup,
+        };
+        struct fuse_in_header ih = {
+                .opcode = FUSE_FORGET,
+                .nodeid = forget->forget_one.nodeid,
+                .unique = fuse_get_unique(fc),
+                .len = sizeof(ih) + sizeof(arg),
+        };
+        spin_unlock(&fc->lock);
+        kfree(forget);  /* its contents were already copied into arg and ih */
+        if (nbytes < ih.len)
+                return -EINVAL;
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
+        if (!err)
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        fuse_copy_finish(cs);
+        if (err)
+                return err;
+        return ih.len;
+}
+static int fuse_read_batch_forget(struct fuse_conn *fc,
+                                   struct fuse_copy_state *cs, size_t nbytes)
+__releases(fc->lock)
+{       /*
+         * Copy out one FUSE_BATCH_FORGET message through @cs: fuse_in_header,
+         * fuse_batch_forget_in, then one fuse_forget_one per dequeued link,
+         * taking as many links as fit in @nbytes.
+         *
+         * Entered with fc->lock held; both exit paths drop it.  Returns the
+         * message length on success, -EINVAL if @nbytes cannot hold even the
+         * two fixed headers, or the error from fuse_copy_one().
+         */
+        int err;
+        unsigned max_forgets;
+        unsigned count;
+        struct fuse_forget_link *head;
+        struct fuse_batch_forget_in arg = { .count = 0 };
+        struct fuse_in_header ih = {
+                .opcode = FUSE_BATCH_FORGET,
+                .unique = fuse_get_unique(fc),
+                .len = sizeof(ih) + sizeof(arg),
+        };
+        if (nbytes < ih.len) {  /* checked before dequeueing, so nothing is removed on this path */
+                spin_unlock(&fc->lock);
+                return -EINVAL;
+        }
+        max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);  /* records that fit after the headers */
+        head = dequeue_forget(fc, max_forgets, &count);
+        spin_unlock(&fc->lock);
+        arg.count = count;
+        ih.len += count * sizeof(struct fuse_forget_one);
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
+        if (!err)
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        while (head) {  /* after an error the copies stop, but every link is still freed */
+                struct fuse_forget_link *forget = head;
+                if (!err) {
+                        err = fuse_copy_one(cs, &forget->forget_one,
+                                            sizeof(forget->forget_one));
+                }
+                head = forget->next;
+                kfree(forget);
+        }
+        fuse_copy_finish(cs);
+        if (err)
+                return err;
+        return ih.len;
+}
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+                            size_t nbytes)
+__releases(fc->lock)
+{       /*
+         * Choose the forget message format: the single-forget form when
+         * fc->minor is below 16 or only one link is queued, otherwise the
+         * batch form.
+         *
+         * Dereferences forget_list_head.next without a NULL check; the
+         * caller in fuse_dev_do_read() tests forget_pending() first.
+         */
+        if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
+                return fuse_read_single_forget(fc, cs, nbytes);
+        else
+                return fuse_read_batch_forget(fc, cs, nbytes);
+}
 /*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
                return fuse_read_interrupt(fc, cs, nbytes, req);
        }
+        if (forget_pending(fc)) {
+                if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
+                        return fuse_read_forget(fc, cs, nbytes);
+                if (fc->forget_batch <= -8)
+                        fc->forget_batch = 16;
+        }
        req = list_entry(fc->pending.next, struct fuse_req, list);
        req->state = FUSE_REQ_READING;
        list_move(&req->list, &fc->io);
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
        if (!fc)
                return -EPERM;
-        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
        if (!fc)
                return -EPERM;
-        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
        flush_bg_queue(fc);
        end_requests(fc, &fc->pending);
        end_requests(fc, &fc->processing);
+        while (forget_pending(fc))
+                kfree(dequeue_forget(fc, 1, NULL));
 }
 /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482..042af7346ec 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
-#include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #if BITS_PER_LONG >= 64
 static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
 */
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
-        struct inode *inode = entry->d_inode;
+        struct inode *inode;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = entry->d_inode;
        if (inode && is_bad_inode(inode))
                return 0;
        else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                struct fuse_entry_out outarg;
                struct fuse_conn *fc;
                struct fuse_req *req;
-                struct fuse_req *forget_req;
+                struct fuse_forget_link *forget;
                struct dentry *parent;
                u64 attr_version;
@@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                if (IS_ERR(req))
                        return 0;
-                forget_req = fuse_get_req(fc);
+                forget = fuse_alloc_forget();
-                if (IS_ERR(forget_req)) {
+                if (!forget) {
                        fuse_put_request(fc, req);
                        return 0;
                }
@@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                if (!err) {
                        struct fuse_inode *fi = get_fuse_inode(inode);
                        if (outarg.nodeid != get_node_id(inode)) {
-                                fuse_send_forget(fc, forget_req,
+                                fuse_queue_forget(fc, forget, outarg.nodeid, 1);
-                                                 outarg.nodeid, 1);
                                return 0;
                        }
                        spin_lock(&fc->lock);
                        fi->nlookup++;
                        spin_unlock(&fc->lock);
                }
-                fuse_put_request(fc, forget_req);
+                kfree(forget);
                if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
                        return 0;
@@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 {
        struct fuse_conn *fc = get_fuse_conn_super(sb);
        struct fuse_req *req;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        u64 attr_version;
        int err;
@@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
        if (IS_ERR(req))
                goto out;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        err = PTR_ERR(forget_req);
+        err = -ENOMEM;
-        if (IS_ERR(forget_req)) {
+        if (!forget) {
                fuse_put_request(fc, req);
                goto out;
        }
@@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
                           attr_version);
        err = -ENOMEM;
        if (!*inode) {
-                fuse_send_forget(fc, forget_req, outarg->nodeid, 1);
+                fuse_queue_forget(fc, forget, outarg->nodeid, 1);
                goto out;
        }
        err = 0;
 out_put_forget:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
 out:
        return err;
 }
@@ -347,7 +350,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
        }
        entry = newent ? newent : entry;
-        entry->d_op = &fuse_dentry_operations;
+        d_set_d_op(entry, &fuse_dentry_operations);
        if (outarg_valid)
                fuse_change_entry_timeout(entry, &outarg);
        else
@@ -374,7 +377,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        struct inode *inode;
        struct fuse_conn *fc = get_fuse_conn(dir);
        struct fuse_req *req;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        struct fuse_create_in inarg;
        struct fuse_open_out outopen;
        struct fuse_entry_out outentry;
@@ -388,9 +391,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (flags & O_DIRECT)
                return -EINVAL;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        if (IS_ERR(forget_req))
+        if (!forget)
-                return PTR_ERR(forget_req);
+                return -ENOMEM;
        req = fuse_get_req(fc);
        err = PTR_ERR(req);
@@ -448,10 +451,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (!inode) {
                flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
                fuse_sync_release(ff, flags);
-                fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
+                fuse_queue_forget(fc, forget, outentry.nodeid, 1);
                return -ENOMEM;
        }
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        d_instantiate(entry, inode);
        fuse_change_entry_timeout(entry, &outentry);
        fuse_invalidate_attr(dir);
@@ -469,7 +472,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 out_put_request:
        fuse_put_request(fc, req);
 out_put_forget_req:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        return err;
 }
@@ -483,12 +486,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        struct fuse_entry_out outarg;
        struct inode *inode;
        int err;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        if (IS_ERR(forget_req)) {
+        if (!forget) {
                fuse_put_request(fc, req);
-                return PTR_ERR(forget_req);
+                return -ENOMEM;
        }
        memset(&outarg, 0, sizeof(outarg));
@@ -515,10 +518,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
                          &outarg.attr, entry_attr_timeout(&outarg), 0);
        if (!inode) {
-                fuse_send_forget(fc, forget_req, outarg.nodeid, 1);
+                fuse_queue_forget(fc, forget, outarg.nodeid, 1);
                return -ENOMEM;
        }
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *alias;
@@ -541,7 +544,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        return 0;
 out_put_forget_req:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        return err;
 }
@@ -981,12 +984,15 @@ static int fuse_access(struct inode *inode, int mask)
 * access request is sent.  Execute permission is still checked
 * locally based on file mode.
 */
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        bool refreshed = false;
        int err = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        if (!fuse_allow_task(fc, current))
                return -EACCES;
@@ -1001,7 +1007,7 @@ static int fuse_permission(struct inode *inode, int mask)
        }
        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-                err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, flags, NULL);
                /* If permission is denied, try to refresh file
                   attributes.  This is also needed, because the root
@@ -1009,7 +1015,8 @@ static int fuse_permission(struct inode *inode, int mask)
                if (err == -EACCES && !refreshed) {
                        err = fuse_do_getattr(inode, NULL, NULL);
                        if (!err)
-                                err = generic_permission(inode, mask, NULL);
+                                err = generic_permission(inode, mask,
+                                                        flags, NULL);
                }
                /* Note: the opposite of the above test does not
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8b984a2cebb..95da1bc1c82 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 * and 64bit.  Fortunately we can determine which structure the server
 * used from the size of the reply.
 */
-static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
-                                 size_t transferred, unsigned count,
+                                     size_t transferred, unsigned count,
-                                 bool is_compat)
+                                     bool is_compat)
 {
 #ifdef CONFIG_COMPAT
        if (count * sizeof(struct compat_iovec) == transferred) {
@@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
        return 0;
 }
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+                                 void *src, size_t transferred, unsigned count,
+                                 bool is_compat)
+{       /*
+         * Convert @count struct fuse_ioctl_iovec entries at @src into struct
+         * iovec entries in @dst.
+         *
+         * When fc->minor is below 16 the work is delegated unchanged to
+         * fuse_copy_ioctl_iovec_old().  Otherwise returns 0 on success and
+         * -EIO when @transferred is not exactly @count entries, or when an
+         * entry fails one of the range checks below.
+         */
+        unsigned i;
+        struct fuse_ioctl_iovec *fiov = src;
+        if (fc->minor < 16) {
+                return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+                                                 count, is_compat);
+        }
+        if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+                return -EIO;
+        for (i = 0; i < count; i++) {
+                /* Did the server supply an inappropriate value? */
+                if (fiov[i].base != (unsigned long) fiov[i].base ||
+                    fiov[i].len != (unsigned long) fiov[i].len)
+                        return -EIO;  /* base or len changes when cast to unsigned long */
+                dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+                dst[i].iov_len = (size_t) fiov[i].len;
+#ifdef CONFIG_COMPAT
+                if (is_compat &&
+                    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+                     (compat_size_t) dst[i].iov_len != fiov[i].len))
+                        return -EIO;  /* compat caller: value does not survive the compat-width conversion */
+#endif
+        }
+        return 0;
+}
 /*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
@@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
        struct fuse_ioctl_out outarg;
        struct fuse_req *req = NULL;
        struct page **pages = NULL;
-        struct page *iov_page = NULL;
+        struct iovec *iov_page = NULL;
        struct iovec *in_iov = NULL, *out_iov = NULL;
        unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
        size_t in_size, out_size, transferred;
        int err;
+#if BITS_PER_LONG == 32
+        inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+        if (flags & FUSE_IOCTL_COMPAT)
+                inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
        /* assume all the iovs returned by client always fits in a page */
-        BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+        BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
        err = -ENOMEM;
        pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
-        iov_page = alloc_page(GFP_KERNEL);
+        iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
        if (!pages || !iov_page)
                goto out;
@@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
         * RETRY from server is not allowed.
         */
        if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
-                struct iovec *iov = page_address(iov_page);
+                struct iovec *iov = iov_page;
                iov->iov_base = (void __user *)arg;
                iov->iov_len = _IOC_SIZE(cmd);
@@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
        /* did it ask for retry? */
        if (outarg.flags & FUSE_IOCTL_RETRY) {
-                char *vaddr;
+                void *vaddr;
                /* no retry if in restricted mode */
                err = -EIO;
@@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                        goto out;
                vaddr = kmap_atomic(pages[0], KM_USER0);
-                err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
+                err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
                                            transferred, in_iovs + out_iovs,
                                            (flags & FUSE_IOCTL_COMPAT) != 0);
                kunmap_atomic(vaddr, KM_USER0);
                if (err)
                        goto out;
-                in_iov = page_address(iov_page);
+                in_iov = iov_page;
                out_iov = in_iov + in_iovs;
                err = fuse_verify_ioctl_iov(in_iov, in_iovs);
@@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 out:
        if (req)
                fuse_put_request(fc, req);
-        if (iov_page)
+        free_page((unsigned long) iov_page);
-                __free_page(iov_page);
        while (num_pages)
                __free_page(pages[--num_pages]);
        kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f10..ae5744a2f9e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
 extern unsigned max_user_bgreq;
 extern unsigned max_user_congthresh;
+/* One forget request, chained to the following one through ->next */
+struct fuse_forget_link {
+        struct fuse_forget_one forget_one;
+        struct fuse_forget_link *next;
+};
 /** FUSE inode */
 struct fuse_inode {
        /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
        u64 nlookup;
        /** The request used for sending the FORGET message */
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        /** Time in jiffies until the file attributes are valid */
        u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
        /** Data for asynchronous requests */
        union {
-                struct fuse_forget_in forget_in;
                struct {
                        struct fuse_release_in in;
                        struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
        /** Pending interrupts */
        struct list_head interrupts;
+        /** Queue of pending forgets */
+        struct fuse_forget_link forget_list_head;
+        struct fuse_forget_link *forget_list_tail;
+        /** Batching of FORGET requests (positive indicates FORGET batch) */
+        int forget_batch;
        /** Flag indicating if connection is blocked.  This will be
            the case before the INIT reply is received, and if there
            are too many outstading backgrounds requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 /**
 * Send FORGET command
 */
-void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
-                      u64 nodeid, u64 nlookup);
+                       u64 nodeid, u64 nlookup);
+struct fuse_forget_link *fuse_alloc_forget(void);
 /**
 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 /**
- * Send a request with no reply
- */
-void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
-/**
 * Send a request in the background
 */
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cfce3ad86a9..f62b32cffea 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
        unsigned blksize;
 };
+/*
+ * Allocate a zeroed struct fuse_forget_link with GFP_KERNEL.
+ * Returns NULL when the allocation fails; the caller owns the result.
+ * Declared with an explicit (void) parameter list so the definition is a
+ * real prototype and agrees with the declaration in fuse_i.h.
+ */
+struct fuse_forget_link *fuse_alloc_forget(void)
+{
+        return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+}
 static struct inode *fuse_alloc_inode(struct super_block *sb)
 {
        struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&fi->queued_writes);
        INIT_LIST_HEAD(&fi->writepages);
        init_waitqueue_head(&fi->page_waitq);
-        fi->forget_req = fuse_request_alloc();
+        fi->forget = fuse_alloc_forget();
-        if (!fi->forget_req) {
+        if (!fi->forget) {
                kmem_cache_free(fuse_inode_cachep, inode);
                return NULL;
        }
@@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
        return inode;
 }
-static void fuse_destroy_inode(struct inode *inode)
+static void fuse_i_callback(struct rcu_head *head)
 {
-        struct fuse_inode *fi = get_fuse_inode(inode);
+        struct inode *inode = container_of(head, struct inode, i_rcu);
-        BUG_ON(!list_empty(&fi->write_files));
+        INIT_LIST_HEAD(&inode->i_dentry);
-        BUG_ON(!list_empty(&fi->queued_writes));
-        if (fi->forget_req)
-                fuse_request_free(fi->forget_req);
        kmem_cache_free(fuse_inode_cachep, inode);
 }
-void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+static void fuse_destroy_inode(struct inode *inode)
-                      u64 nodeid, u64 nlookup)
 {
-        struct fuse_forget_in *inarg = &req->misc.forget_in;
+        struct fuse_inode *fi = get_fuse_inode(inode);
-        inarg->nlookup = nlookup;
+        BUG_ON(!list_empty(&fi->write_files));
-        req->in.h.opcode = FUSE_FORGET;
+        BUG_ON(!list_empty(&fi->queued_writes));
-        req->in.h.nodeid = nodeid;
+        kfree(fi->forget);
-        req->in.numargs = 1;
+        call_rcu(&inode->i_rcu, fuse_i_callback);
-        req->in.args[0].size = sizeof(struct fuse_forget_in);
-        req->in.args[0].value = inarg;
-        fuse_request_send_noreply(fc, req);
 }
 static void fuse_evict_inode(struct inode *inode)
@@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
        if (inode->i_sb->s_flags & MS_ACTIVE) {
                struct fuse_conn *fc = get_fuse_conn(inode);
                struct fuse_inode *fi = get_fuse_inode(inode);
-                fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup);
+                fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
-                fi->forget_req = NULL;
+                fi->forget = NULL;
        }
 }
@@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
        INIT_LIST_HEAD(&fc->interrupts);
        INIT_LIST_HEAD(&fc->bg_queue);
        INIT_LIST_HEAD(&fc->entry);
+        fc->forget_list_tail = &fc->forget_list_head;
        atomic_set(&fc->num_waiting, 0);
        fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
        fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -619,7 +618,7 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
        entry = d_obtain_alias(inode);
        if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
-                entry->d_op = &fuse_dentry_operations;
+                d_set_d_op(entry, &fuse_dentry_operations);
                fuse_invalidate_entry_cache(entry);
        }
@@ -721,7 +720,7 @@ static struct dentry *fuse_get_parent(struct dentry *child)
        parent = d_obtain_alias(inode);
        if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
-                parent->d_op = &fuse_dentry_operations;
+                d_set_d_op(parent, &fuse_dentry_operations);
                fuse_invalidate_entry_cache(parent);
        }
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a69..06c48a89183 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
 }
 int
-generic_check_acl(struct inode *inode, int mask)
+generic_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
-        if (acl) {
+                        return -ECHILD;
-                int error = posix_acl_permission(inode, acl, mask);
+        } else {
-                posix_acl_release(acl);
+                struct posix_acl *acl;
-                return error;
+                acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+                if (acl) {
+                        int error = posix_acl_permission(inode, acl, mask);
+                        posix_acl_release(acl);
+                        return error;
+                }
        }
        return -EAGAIN;
 }
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943..7118f1a780a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
 * Returns: errno
 */
-int gfs2_check_acl(struct inode *inode, int mask)
+int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39e..a93907c8159 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
 #define GFS2_POSIX_ACL_DEFAULT          "posix_acl_default"
 #define GFS2_ACL_MAX_ENTRIES            25
-extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4e..3c4039d5eef 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        int metadata;
        unsigned int revokes = 0;
        int x;
-        int error;
+        int error = 0;
        if (!*top)
                sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-        error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+        if (ip != GFS2_I(sdp->sd_rindex))
+                error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+        else if (!sdp->sd_rgrps)
+                error = gfs2_ri_update(ip);
        if (error)
                return error;
@@ -879,7 +883,8 @@ out_rg_gunlock:
 out_rlist:
        gfs2_rlist_free(&rlist);
 out:
-        gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+        if (ip != GFS2_I(sdp->sd_rindex))
+                gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
        return error;
 }
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 6798755b385..4a456338b87 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/namei.h>
 #include <linux/crc32.h>
 #include "gfs2.h"
@@ -34,15 +35,23 @@
 static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct dentry *parent = dget_parent(dentry);
+        struct dentry *parent;
-        struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+        struct gfs2_sbd *sdp;
-        struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+        struct gfs2_inode *dip;
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        struct gfs2_holder d_gh;
        struct gfs2_inode *ip = NULL;
        int error;
        int had_lock = 0;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        parent = dget_parent(dentry);
+        sdp = GFS2_SB(parent->d_inode);
+        dip = GFS2_I(parent->d_inode);
+        inode = dentry->d_inode;
        if (inode) {
                if (is_bad_inode(inode))
                        goto invalid;
@@ -100,13 +109,14 @@ fail:
        return 0;
 }
-static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *str)
 {
        str->hash = gfs2_disk_hash(str->name, str->len);
        return 0;
 }
-static int gfs2_dentry_delete(struct dentry *dentry)
+static int gfs2_dentry_delete(const struct dentry *dentry)
 {
        struct gfs2_inode *ginode;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5ab3839dfcb..97012ecff56 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -130,7 +130,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
        dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
        if (!IS_ERR(dentry))
-                dentry->d_op = &gfs2_dops;
+                d_set_d_op(dentry, &gfs2_dops);
        return dentry;
 }
@@ -158,7 +158,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 out_inode:
        dentry = d_obtain_alias(inode);
        if (!IS_ERR(dentry))
-                dentry->d_op = &gfs2_dops;
+                d_set_d_op(dentry, &gfs2_dops);
        return dentry;
 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5..fca6689e12e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -241,7 +241,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
            !capable(CAP_LINUX_IMMUTABLE))
                goto out;
        if (!IS_IMMUTABLE(inode)) {
-                error = gfs2_permission(inode, MAY_WRITE);
+                error = gfs2_permission(inode, MAY_WRITE, 0);
                if (error)
                        goto out;
        }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f92c1770416..08a8beb152e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
        spin_unlock(&gl->gl_spin);
 }
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                 unsigned int req_state,
-                                 unsigned int flags)
-{
-        int ret = LM_OUT_ERROR;
-        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-                return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-                                                         req_state, flags);
-        return ret;
-}
 /**
 * do_xmote - Calls the DLM to change the state of a lock
 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
-        BUG_ON(gl->gl_state == target);
+        GLOCK_BUG_ON(gl, gl->gl_state == target);
-        BUG_ON(gl->gl_state == gl->gl_target);
+        GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
+        gl->gl_req = target;
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
-        ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
-        if (!(ret & LM_OUT_ASYNC)) {
+        if (sdp->sd_lockstruct.ls_ops->lm_lock) {
-                finish_xmote(gl, ret);
+                /* lock_dlm */
+                ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+                GLOCK_BUG_ON(gl, ret);
+        } else { /* lock_nolock */
+                finish_xmote(gl, target);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
-        } else {
-                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }
        spin_lock(&gl->gl_spin);
 }
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
        if (seq) {
                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
                seq_printf(seq, gi->string);
        } else {
-                printk(KERN_ERR " ");
+                vaf.fmt = fmt;
-                vprintk(fmt, args);
+                vaf.va = &args;
+                printk(KERN_ERR " %pV", &vaf);
        }
        va_end(args);
 }
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
 * @gl: Pointer to the glock
 * @ret: The return value from the dlm
 *
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
 */
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+        spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
        if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-                spin_lock(&gl->gl_spin);
                if (gfs2_should_freeze(gl)) {
                        set_bit(GLF_FROZEN, &gl->gl_flags);
                        spin_unlock(&gl->gl_spin);
                        return;
                }
-                spin_unlock(&gl->gl_spin);
        }
+        spin_unlock(&gl->gl_spin);
        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+        smp_wmb();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
        struct task_struct *gh_owner = NULL;
-        char buffer[KSYM_SYMBOL_LEN];
        char flags_buf[32];
-        sprint_symbol(buffer, gh->gh_ip);
        if (gh->gh_owner_pid)
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
-                  state2str(gh->gh_state),
+                       state2str(gh->gh_state),
-                  hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+                       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                  gh->gh_error, 
+                       gh->gh_error,
-                  gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+                       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                  gh_owner ? gh_owner->comm : "(ended)", buffer);
+                       gh_owner ? gh_owner->comm : "(ended)",
+                       (void *)gh->gh_ip);
        return 0;
 }
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
        }
 #endif
-        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
-        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
-                                                WQ_FREEZEABLE, 0);
+                                                WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                                0);
        if (IS_ERR(gfs2_delete_workqueue)) {
                destroy_workqueue(glock_workqueue);
                return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d22..691851ceb61 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
 #define GL_ASYNC                0x00000040
 #define GL_EXACT                0x00000080
 #define GL_SKIP                 0x00000100
-#define GL_ATIME                0x00000200
 #define GL_NOCACHE              0x00000400
  
 /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
 *
 * LM_OUT_ST_MASK
 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
 * LM_OUT_CANCELED
 * The lock request was canceled.
 *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
 */
 #define LM_OUT_ST_MASK          0x00000003
 #define LM_OUT_CANCELED         0x00000008
-#define LM_OUT_ASYNC            0x00000080
+#define LM_OUT_ERROR            0x00000004
-#define LM_OUT_ERROR            0x00000100
 /*
 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
        void (*lm_unmount) (struct gfs2_sbd *sdp);
        void (*lm_withdraw) (struct gfs2_sbd *sdp);
        void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-        unsigned int (*lm_lock) (struct gfs2_glock *gl,
+        int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
-                                 unsigned int req_state, unsigned int flags);
+                        unsigned int flags);
        void (*lm_cancel) (struct gfs2_glock *gl);
        const match_table_t *lm_tokens;
 };
-#define LM_FLAG_TRY             0x00000001
-#define LM_FLAG_TRY_1CB         0x00000002
-#define LM_FLAG_NOEXP           0x00000004
-#define LM_FLAG_ANY             0x00000008
-#define LM_FLAG_PRIORITY        0x00000010
-#define GL_ASYNC                0x00000040
-#define GL_EXACT                0x00000080
-#define GL_SKIP                 0x00000100
-#define GL_NOCACHE              0x00000400
-#define GLR_TRYFAILED           13
 extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e..263561bf1a5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-                flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc..a79790c0627 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
 #define __INCORE_DOT_H__
 #include <linux/fs.h>
+#include <linux/kobject.h>
 #include <linux/workqueue.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
@@ -207,12 +208,14 @@ struct gfs2_glock {
        spinlock_t gl_spin;
-        unsigned int gl_state;
+        /* State fields protected by gl_spin */
-        unsigned int gl_target;
+        unsigned int gl_state:2,        /* Current state */
-        unsigned int gl_reply;
+                     gl_target:2,       /* Target state */
+                     gl_demote_state:2, /* State requested by remote node */
+                     gl_req:2,          /* State in last dlm request */
+                     gl_reply:8;        /* Last reply from the dlm */
        unsigned int gl_hash;
-        unsigned int gl_req;
-        unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
        struct list_head gl_holders;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f921..2232b3c780b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -509,7 +509,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        }
        if (!is_root) {
-                error = gfs2_permission(dir, MAY_EXEC);
+                error = gfs2_permission(dir, MAY_EXEC, 0);
                if (error)
                        goto out;
        }
@@ -539,7 +539,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
        int error;
-        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                return error;
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
        if (error)
                return error;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
-        gfs2_assert_warn(GFS2_SB(inode), !error);
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d8499fadcc5..732a183efdb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -113,7 +113,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
                                  const struct qstr *name,
                                  unsigned int mode, dev_t dev);
-extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45f..6e493aee28f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
        return lkf;
 }
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
-                              unsigned int req_state, unsigned int flags)
+                     unsigned int flags)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-        int error;
        int req;
        u32 lkf;
-        gl->gl_req = req_state;
        req = make_mode(req_state);
        lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
         * Submit the actual lock request.
         */
-        error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+        return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-                         GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-        if (error == -EAGAIN)
-                return 0;
-        if (error)
-                return LM_OUT_ERROR;
-        return LM_OUT_ASYNC;
 }
 static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b8..2aeabd4218c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
                iput(inode);
                return -ENOMEM;
        }
-        dentry->d_op = &gfs2_dops;
+        d_set_d_op(dentry, &gfs2_dops);
        *dptr = dentry;
        return 0;
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c..1501db4f0e6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -106,7 +106,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
 {
        struct inode *inode = NULL;
-        dentry->d_op = &gfs2_dops;
+        d_set_d_op(dentry, &gfs2_dops);
        inode = gfs2_lookupi(dir, &dentry->d_name, 0);
        if (inode && IS_ERR(inode))
@@ -166,7 +166,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (error)
                goto out_child;
-        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                goto out_gunlock;
@@ -289,7 +289,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
-        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                return error;
@@ -822,7 +822,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        }
                }
        } else {
-                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
                if (error)
                        goto out_gunlock;
@@ -857,7 +857,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        /* Check out the dir to be renamed */
        if (dir_rename) {
-                error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+                error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
                if (error)
                        goto out_gunlock;
        }
@@ -1041,13 +1041,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 * Returns: errno
 */
-int gfs2_permission(struct inode *inode, int mask)
+int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_inode *ip;
        struct gfs2_holder i_gh;
        int error;
        int unlock = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        ip = GFS2_I(inode);
        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
@@ -1058,7 +1062,7 @@ int gfs2_permission(struct inode *inode, int mask)
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
                error = -EACCES;
        else
-                error = generic_permission(inode, mask, gfs2_check_acl);
+                error = generic_permission(inode, mask, flags, gfs2_check_acl);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
@@ -1069,7 +1073,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        struct buffer_head *dibh;
        u32 ouid, ogid, nuid, ngid;
        int error;
@@ -1100,25 +1103,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (error)
                goto out_gunlock_q;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+        error = gfs2_setattr_simple(ip, attr);
        if (error)
                goto out_end_trans;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                int error;
-                error = vmtruncate(inode, attr->ia_size);
-                gfs2_assert_warn(sdp, !error);
-        }
-        setattr_copy(inode, attr);
-        mark_inode_dirty(inode);
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        gfs2_dinode_out(ip, dibh->b_data);
-        brelse(dibh);
        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
                gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f606baf9ba7..a689901963d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_limit = qp->qu_limit;
                }
+                if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                        qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                        qd->qd_qb.qb_value = qp->qu_value;
+                }
        }
        /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
 }
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
 static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                          struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                fdq->d_fieldmask ^= FS_DQ_BSOFT;
        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                fdq->d_fieldmask ^= FS_DQ_BHARD;
+        if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+            ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+                fdq->d_fieldmask ^= FS_DQ_BCOUNT;
        if (fdq->d_fieldmask == 0)
                goto out_i;
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
        .get_dqblk      = gfs2_get_dqblk,
        .set_dqblk      = gfs2_set_dqblk,
 };
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 33c8407b876..7293ea27020 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
-                if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+                if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 * Returns: 0 on successful update, error code otherwise
 */
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 }
 /**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct inode *inode = &ip->i_inode;
-        struct file_ra_state ra_state;
-        struct gfs2_rgrpd *rgd;
-        unsigned int max_data = 0;
-        int error;
-        file_ra_state_init(&ra_state, inode->i_mapping);
-        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-                /* Ignore partials */
-                if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                    i_size_read(inode))
-                        break;
-                error = read_rindex_entry(ip, &ra_state);
-                if (error) {
-                        clear_rgrpdi(sdp);
-                        return error;
-                }
-        }
-        list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-                if (rgd->rd_data > max_data)
-                        max_data = rgd->rd_data;
-        sdp->sd_max_rg_data = max_data;
-        sdp->sd_rindex_uptodate = 1;
-        return 0;
-}
-/**
 * gfs2_rindex_hold - Grab a lock on the rindex
 * @sdp: The GFS2 superblock
 * @ri_gh: the glock holder
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
                        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
-                        error = gfs2_ri_update_special(ip);
+                        error = gfs2_ri_update(ip);
                if (error)
                        return error;
        }
+try_again:
        do {
                error = get_local_rgrp(ip, &last_unlinked);
                /* If there is no space, flushing the log may release some */
-                if (error)
+                if (error) {
+                        if (ip == GFS2_I(sdp->sd_rindex) &&
+                            !sdp->sd_rindex_uptodate) {
+                                error = gfs2_ri_update(ip);
+                                if (error)
+                                        return error;
+                                goto try_again;
+                        }
                        gfs2_log_flush(sdp, NULL);
+                }
        } while (error && tries++ < 3);
        if (error) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9..50c2bb04369 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2b2c4997430..16c2ecac7eb 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1405,11 +1405,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
        return &ip->i_inode;
 }
-static void gfs2_destroy_inode(struct inode *inode)
+static void gfs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(gfs2_inode_cachep, inode);
 }
+static void gfs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
 const struct super_operations gfs2_super_ops = {
        .alloc_inode            = gfs2_alloc_inode,
        .destroy_inode          = gfs2_destroy_inode,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a..439b61c0326 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-        struct inode *inode = &ip->i_inode;
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_ea_location el;
-        struct buffer_head *dibh;
        int error;
        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
        if (error)
                return error;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+        error = gfs2_setattr_simple(ip, attr);
-        if (error)
-                goto out_trans_end;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                int error;
-                error = vmtruncate(inode, attr->ia_size);
-                gfs2_assert_warn(GFS2_SB(inode), !error);
-        }
-        setattr_copy(inode, attr);
-        mark_inode_dirty(inode);
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        gfs2_dinode_out(ip, dibh->b_data);
-        brelse(dibh);
-out_trans_end:
        gfs2_trans_end(sdp);
        return error;
 }
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41..ea4aefe7c65 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,7 +25,7 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
        struct inode *inode = NULL;
        int res;
-        dentry->d_op = &hfs_dentry_operations;
+        d_set_d_op(dentry, &hfs_dentry_operations);
        hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
        hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index c8cffb81e84..ad97c2d5828 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
-extern int hfs_hash_dentry(struct dentry *, struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
+                struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
                      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+extern int hfs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 /* trans.c */
 extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af7942..495a976a3cc 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
 /*
 * Hash a string to an integer in a case-independent way
 */
-int hfs_hash_dentry(struct dentry *dentry, struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *this)
 {
        const unsigned char *name = this->name;
        unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
 * Test for equality of two strings in the HFS filename character ordering.
 * return 1 on failure and 0 on success
 */
-int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        const unsigned char *n1, *n2;
-        int len;
-        len = s1->len;
        if (len >= HFS_NAMELEN) {
-                if (s2->len < HFS_NAMELEN)
+                if (name->len < HFS_NAMELEN)
                        return 1;
                len = HFS_NAMELEN;
-        } else if (len != s2->len)
+        } else if (len != name->len)
                return 1;
-        n1 = s1->name;
+        n1 = str;
-        n2 = s2->name;
+        n2 = name->name;
        while (len--) {
                if (caseorder[*n1++] != caseorder[*n2++])
                        return 1;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4824c27cebb..0bef62aa4f4 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
        return i ? &i->vfs_inode : NULL;
 }
-static void hfs_destroy_inode(struct inode *inode)
+static void hfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
+static void hfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hfs_i_callback);
+}
 static const struct super_operations hfs_super_operations = {
        .alloc_inode    = hfs_alloc_inode,
        .destroy_inode  = hfs_destroy_inode,
@@ -427,7 +434,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
        if (!sb->s_root)
                goto bail_iput;
-        sb->s_root->d_op = &hfs_dentry_operations;
+        d_set_d_op(sb->s_root, &hfs_dentry_operations);
        /* everything's okay */
        return 0;
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219a..19cf291eb91 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
 * This file contains the code to do various system dependent things.
 */
+#include <linux/namei.h>
 #include "hfs_fs.h"
 /* dentry case-handling: just lowercase everything */
 static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        int diff;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
        if(!inode)
                return 1;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index d182438c7ae..5d799c13205 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
                return -ENOMEM;
        fd->search_key = ptr;
        fd->key = ptr + tree->max_key_len + 2;
-        dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
+        dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+                tree->cnid, __builtin_return_address(0));
        mutex_lock(&tree->tree_lock);
        return 0;
 }
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
        hfs_bnode_put(fd->bnode);
        kfree(fd->search_key);
-        dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
+        dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+                fd->tree->cnid, __builtin_return_address(0));
        mutex_unlock(&fd->tree->tree_lock);
        fd->tree = NULL;
 }
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ad57f5991eb..1cad80c789c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,7 +15,8 @@
 #define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8)
-int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+                u32 offset, u32 *max)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
        struct page *page;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba7..1c42cc5b899 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 {
        __be16 data;
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_read(node, &data, off, 2);
        return be16_to_cpu(data);
 }
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 {
        u8 data;
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_read(node, &data, off, 1);
        return data;
 }
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
 {
        __be16 v = cpu_to_be16(data);
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_write(node, &v, off, 2);
 }
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                                dst_page--;
                        }
                        src -= len;
-                        memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len);
+                        memmove(kmap(*dst_page) + src,
+                                kmap(*src_page) + src, len);
                        kunmap(*src_page);
                        set_page_dirty(*dst_page);
                        kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                if (src == dst) {
                        l = min(len, (int)PAGE_CACHE_SIZE - src);
-                        memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+                        memmove(kmap(*dst_page) + src,
+                                kmap(*src_page) + src, l);
                        kunmap(*src_page);
                        set_page_dirty(*dst_page);
                        kunmap(*dst_page);
                        while ((len -= l) != 0) {
                                l = min(len, (int)PAGE_CACHE_SIZE);
-                                memmove(kmap(*++dst_page), kmap(*++src_page), l);
+                                memmove(kmap(*++dst_page),
+                                        kmap(*++src_page), l);
                                kunmap(*src_page);
                                set_page_dirty(*dst_page);
                                kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                        do {
                                src_ptr = kmap(*src_page) + src;
                                dst_ptr = kmap(*dst_page) + dst;
-                                if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+                                if (PAGE_CACHE_SIZE - src <
+                                                PAGE_CACHE_SIZE - dst) {
                                        l = PAGE_CACHE_SIZE - src;
                                        src = 0;
                                        dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
                        return;
                tmp->next = node->next;
                cnid = cpu_to_be32(tmp->next);
-                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+                hfs_bnode_write(tmp, &cnid,
+                        offsetof(struct hfs_bnode_desc, next), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_head = node->next;
@@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
                        return;
                tmp->prev = node->prev;
                cnid = cpu_to_be32(tmp->prev);
-                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+                hfs_bnode_write(tmp, &cnid,
+                        offsetof(struct hfs_bnode_desc, prev), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_tail = node->prev;
-        // move down?
+        /* move down? */
-        if (!node->prev && !node->next) {
+        if (!node->prev && !node->next)
-                printk(KERN_DEBUG "hfs_btree_del_level\n");
+                dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
-        }
        if (!node->parent) {
                tree->root = 0;
                tree->depth = 0;
@@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
        struct hfs_bnode *node;
        if (cnid >= tree->node_count) {
-                printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+                printk(KERN_ERR "hfs: request for non-existent node "
+                                "%d in B*Tree\n",
+                        cnid);
                return NULL;
        }
        for (node = tree->node_hash[hfs_bnode_hash(cnid)];
-             node; node = node->next_hash) {
+                        node; node = node->next_hash)
-                if (node->this == cnid) {
+                if (node->this == cnid)
                        return node;
-                }
-        }
        return NULL;
 }
@@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        loff_t off;
        if (cnid >= tree->node_count) {
-                printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+                printk(KERN_ERR "hfs: request for non-existent node "
+                                "%d in B*Tree\n",
+                        cnid);
                return NULL;
        }
@@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        } else {
                spin_unlock(&tree->hash_lock);
                kfree(node);
-                wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+                wait_event(node2->lock_wq,
+                        !test_bit(HFS_BNODE_NEW, &node2->flags));
                return node2;
        }
        spin_unlock(&tree->hash_lock);
@@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
        if (node) {
                hfs_bnode_get(node);
                spin_unlock(&tree->hash_lock);
-                wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+                wait_event(node->lock_wq,
+                        !test_bit(HFS_BNODE_NEW, &node->flags));
                if (test_bit(HFS_BNODE_ERROR, &node->flags))
                        goto node_error;
                return node;
@@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
        if (!test_bit(HFS_BNODE_NEW, &node->flags))
                return node;
-        desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+        desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+                        node->page_offset);
        node->prev = be32_to_cpu(desc->prev);
        node->next = be32_to_cpu(desc->next);
        node->num_recs = be16_to_cpu(desc->num_recs);
@@ -556,11 +566,13 @@ node_error:
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-        //int i;
+#if 0
+        int i;
-        //for (i = 0; i < node->tree->pages_per_bnode; i++)
+        for (i = 0; i < node->tree->pages_per_bnode; i++)
-        //      if (node->page[i])
+                if (node->page[i])
-        //              page_cache_release(node->page[i]);
+                        page_cache_release(node->page[i]);
+#endif
        kfree(node);
 }
@@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
        if (node) {
                atomic_inc(&node->refcnt);
                dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                        node->tree->cnid, node->this,
+                        atomic_read(&node->refcnt));
        }
 }
@@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
                int i;
                dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                        node->tree->cnid, node->this,
+                        atomic_read(&node->refcnt));
                BUG_ON(!atomic_read(&node->refcnt));
                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
                        return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2f39d05443e..2312de34bd4 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,7 +39,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
           !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
                retval = node->tree->max_key_len + 2;
        } else {
-                recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+                recoff = hfs_bnode_read_u16(node,
+                        node->tree->node_size - (rec + 1) * 2);
                if (!recoff)
                        return 0;
@@ -84,7 +85,8 @@ again:
        end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
        end_off = hfs_bnode_read_u16(node, end_rec_off);
        end_rec_off -= 2;
-        dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+        dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+                rec, size, end_off, end_rec_off);
        if (size > end_rec_off - end_off) {
                if (new_node)
                        panic("not enough room!\n");
@@ -99,7 +101,9 @@ again:
        }
        node->num_recs++;
        /* write new last offset */
-        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+        hfs_bnode_write_u16(node,
+                offsetof(struct hfs_bnode_desc, num_recs),
+                node->num_recs);
        hfs_bnode_write_u16(node, end_rec_off, end_off + size);
        data_off = end_off;
        data_rec_off = end_rec_off + 2;
@@ -151,7 +155,8 @@ skip:
                if (tree->attributes & HFS_TREE_VARIDXKEYS)
                        key_len = be16_to_cpu(fd->search_key->key_len) + 2;
                else {
-                        fd->search_key->key_len = cpu_to_be16(tree->max_key_len);
+                        fd->search_key->key_len =
+                                cpu_to_be16(tree->max_key_len);
                        key_len = tree->max_key_len + 2;
                }
                goto again;
@@ -180,7 +185,8 @@ again:
                mark_inode_dirty(tree->inode);
        }
        hfs_bnode_dump(node);
-        dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+        dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+                fd->record, fd->keylength + fd->entrylength);
        if (!--node->num_recs) {
                hfs_bnode_unlink(node);
                if (!node->parent)
@@ -194,7 +200,9 @@ again:
                __hfs_brec_find(node, fd);
                goto again;
        }
-        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+        hfs_bnode_write_u16(node,
+                offsetof(struct hfs_bnode_desc, num_recs),
+                node->num_recs);
        if (rec_off == end_off)
                goto skip;
@@ -364,7 +372,8 @@ again:
                newkeylen = hfs_bnode_read_u16(node, 14) + 2;
        else
                fd->keylength = newkeylen = tree->max_key_len + 2;
-        dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen);
+        dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
+                rec, fd->keylength, newkeylen);
        rec_off = tree->node_size - (rec + 2) * 2;
        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -375,7 +384,7 @@ again:
                end_off = hfs_bnode_read_u16(parent, end_rec_off);
                if (end_rec_off - end_off < diff) {
-                        printk(KERN_DEBUG "hfs: splitting index node...\n");
+                        dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
                        fd->bnode = parent;
                        new_node = hfs_bnode_split(fd);
                        if (IS_ERR(new_node))
@@ -383,7 +392,8 @@ again:
                        parent = fd->bnode;
                        rec = fd->record;
                        rec_off = tree->node_size - (rec + 2) * 2;
-                        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+                        end_rec_off = tree->node_size -
+                                (parent->num_recs + 1) * 2;
                }
        }
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 22e4d4e3299..21023d9f8ff 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -51,7 +51,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                goto free_inode;
        /* Load the header */
-        head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+        head = (struct hfs_btree_header_rec *)(kmap(page) +
+                sizeof(struct hfs_bnode_desc));
        tree->root = be32_to_cpu(head->root);
        tree->leaf_count = be32_to_cpu(head->leaf_count);
        tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -115,7 +116,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
        tree->node_size_shift = ffs(size) - 1;
-        tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        tree->pages_per_bnode =
+                (tree->node_size + PAGE_CACHE_SIZE - 1) >>
+                PAGE_CACHE_SHIFT;
        kunmap(page);
        page_cache_release(page);
@@ -144,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
                while ((node = tree->node_hash[i])) {
                        tree->node_hash[i] = node->next_hash;
                        if (atomic_read(&node->refcnt))
-                                printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n",
+                                printk(KERN_CRIT "hfs: node %d:%d "
-                                        node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                                                "still has %d user(s)!\n",
+                                        node->tree->cnid, node->this,
+                                        atomic_read(&node->refcnt));
                        hfs_bnode_free(node);
                        tree->node_hash_cnt--;
                }
@@ -166,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
                return;
        /* Load the header */
        page = node->page[0];
-        head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+        head = (struct hfs_btree_header_rec *)(kmap(page) +
+                sizeof(struct hfs_bnode_desc));
        head->root = cpu_to_be32(tree->root);
        head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -272,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
                                                tree->free_nodes--;
                                                mark_inode_dirty(tree->inode);
                                                hfs_bnode_put(node);
-                                                return hfs_bnode_create(tree, idx);
+                                                return hfs_bnode_create(tree,
+                                                        idx);
                                        }
                                }
                        }
@@ -287,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
                kunmap(*pagep);
                nidx = node->next;
                if (!nidx) {
-                        printk(KERN_DEBUG "hfs: create new bmap node...\n");
+                        dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
                        next_node = hfs_bmap_new_bmap(node, idx);
                } else
                        next_node = hfs_bnode_find(tree, nidx);
@@ -329,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
                hfs_bnode_put(node);
                if (!i) {
                        /* panic */;
-                        printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
+                        printk(KERN_CRIT "hfs: unable to free bnode %u. "
+                                        "bmap not found!\n",
+                                node->this);
                        return;
                }
                node = hfs_bnode_find(tree, i);
@@ -337,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
                        return;
                if (node->type != HFS_NODE_MAP) {
                        /* panic */;
-                        printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
+                        printk(KERN_CRIT "hfs: invalid bmap found! "
+                                        "(%u,%d)\n",
+                                node->this, node->type);
                        hfs_bnode_put(node);
                        return;
                }
@@ -350,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
        m = 1 << (~nidx & 7);
        byte = data[off];
        if (!(byte & m)) {
-                printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
+                printk(KERN_CRIT "hfs: trying to free free bnode "
+                                "%u(%d)\n",
+                        node->this, node->type);
                kunmap(page);
                hfs_bnode_put(node);
                return;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 8af45fc5b05..b4ba1b31933 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -91,7 +91,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
                perms->dev = 0;
 }
-static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
+static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
+                u32 cnid, struct inode *inode)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
@@ -128,20 +129,32 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                if (cnid == inode->i_ino) {
                        hfsplus_cat_set_perms(inode, &file->permissions);
                        if (S_ISLNK(inode->i_mode)) {
-                                file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
+                                file->user_info.fdType =
-                                file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
+                                        cpu_to_be32(HFSP_SYMLINK_TYPE);
+                                file->user_info.fdCreator =
+                                        cpu_to_be32(HFSP_SYMLINK_CREATOR);
                        } else {
-                                file->user_info.fdType = cpu_to_be32(sbi->type);
+                                file->user_info.fdType =
-                                file->user_info.fdCreator = cpu_to_be32(sbi->creator);
+                                        cpu_to_be32(sbi->type);
+                                file->user_info.fdCreator =
+                                        cpu_to_be32(sbi->creator);
                        }
-                        if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
+                        if (HFSPLUS_FLG_IMMUTABLE &
-                                file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
+                                        (file->permissions.rootflags |
+                                        file->permissions.userflags))
+                                file->flags |=
+                                        cpu_to_be16(HFSPLUS_FILE_LOCKED);
                } else {
-                        file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
+                        file->user_info.fdType =
-                        file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
+                                cpu_to_be32(HFSP_HARDLINK_TYPE);
-                        file->user_info.fdFlags = cpu_to_be16(0x100);
+                        file->user_info.fdCreator =
-                        file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+                                cpu_to_be32(HFSP_HFSPLUS_CREATOR);
-                        file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
+                        file->user_info.fdFlags =
+                                cpu_to_be16(0x100);
+                        file->create_date =
+                                HFSPLUS_I(sbi->hidden_dir)->create_date;
+                        file->permissions.dev =
+                                cpu_to_be32(HFSPLUS_I(inode)->linkid);
                }
                return sizeof(*file);
        }
@@ -182,12 +195,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
                return -EIO;
        }
-        hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
+        hfsplus_cat_build_key_uni(fd->search_key,
-                                 &tmp.thread.nodeName);
+                be32_to_cpu(tmp.thread.parentID),
+                &tmp.thread.nodeName);
        return hfs_brec_find(fd);
 }
-int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
+int hfsplus_create_cat(u32 cnid, struct inode *dir,
+                struct qstr *str, struct inode *inode)
 {
        struct super_block *sb = dir->i_sb;
        struct hfs_find_data fd;
@@ -195,13 +210,15 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
        int entry_size;
        int err;
-        dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
+        dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
+                str->name, cnid, inode->i_nlink);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-        entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
+        entry_size = hfsplus_fill_cat_thread(sb, &entry,
+                S_ISDIR(inode->i_mode) ?
                        HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
-                        dir->i_ino, str);
+                dir->i_ino, str);
        err = hfs_brec_find(&fd);
        if (err != -ENOENT) {
                if (!err)
@@ -227,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
        dir->i_size++;
        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dir);
+        hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
        hfs_find_exit(&fd);
        return 0;
@@ -249,7 +267,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
        int err, off;
        u16 type;
-        dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+        dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
+                str ? str->name : NULL, cnid);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        if (!str) {
@@ -260,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                if (err)
                        goto out;
-                off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName);
+                off = fd.entryoffset +
+                        offsetof(struct hfsplus_cat_thread, nodeName);
                fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
-                hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2);
+                hfs_bnode_read(fd.bnode,
+                        &fd.search_key->cat.name.length, off, 2);
                len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
-                hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len);
+                hfs_bnode_read(fd.bnode,
+                        &fd.search_key->cat.name.unicode,
+                        off + 2, len);
                fd.search_key->key_len = cpu_to_be16(6 + len);
        } else
                hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -281,7 +304,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
 #endif
-                off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork);
+                off = fd.entryoffset +
+                        offsetof(struct hfsplus_cat_file, rsrc_fork);
                hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
                hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
        }
@@ -308,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
        dir->i_size--;
        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dir);
+        hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
 out:
        hfs_find_exit(&fd);
@@ -325,7 +349,8 @@ int hfsplus_rename_cat(u32 cnid,
        int entry_size, type;
        int err = 0;
-        dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
+        dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+                cnid, src_dir->i_ino, src_name->name,
                dst_dir->i_ino, dst_name->name);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
        dst_fd = src_fd;
@@ -353,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        dst_dir->i_size++;
        dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dst_dir);
        /* finally remove the old entry */
        hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -365,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        src_dir->i_size--;
        src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(src_dir);
        /* remove old thread entry */
        hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -379,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
        /* create new thread entry */
        hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
-        entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name);
+        entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
+                dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
                if (!err)
@@ -387,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        }
        err = hfs_brec_insert(&dst_fd, &entry, entry_size);
+        hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
+        hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
 out:
        hfs_bnode_put(dst_fd.bnode);
        hfs_find_exit(&src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9d59c0571f5..f896dc84302 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
        sb = dir->i_sb;
-        dentry->d_op = &hfsplus_dentry_operations;
+        d_set_d_op(dentry, &hfsplus_dentry_operations);
        dentry->d_fsdata = NULL;
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
@@ -66,11 +66,17 @@ again:
                        goto fail;
                }
                cnid = be32_to_cpu(entry.file.id);
-                if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
+                if (entry.file.user_info.fdType ==
-                    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+                                cpu_to_be32(HFSP_HARDLINK_TYPE) &&
-                    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+                                entry.file.user_info.fdCreator ==
-                     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+                                cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-                    HFSPLUS_SB(sb)->hidden_dir) {
+                                (entry.file.create_date ==
+                                        HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
+                                                create_date ||
+                                entry.file.create_date ==
+                                        HFSPLUS_I(sb->s_root->d_inode)->
+                                                create_date) &&
+                                HFSPLUS_SB(sb)->hidden_dir) {
                        struct qstr str;
                        char name[32];
@@ -83,11 +89,13 @@ again:
                                linkid = 0;
                        } else {
                                dentry->d_fsdata = (void *)(unsigned long)cnid;
-                                linkid = be32_to_cpu(entry.file.permissions.dev);
+                                linkid =
+                                        be32_to_cpu(entry.file.permissions.dev);
                                str.len = sprintf(name, "iNode%d", linkid);
                                str.name = name;
                                hfsplus_cat_build_key(sb, fd.search_key,
-                                        HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
+                                        HFSPLUS_SB(sb)->hidden_dir->i_ino,
+                                        &str);
                                goto again;
                        }
                } else if (!dentry->d_fsdata)
@@ -139,7 +147,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                filp->f_pos++;
                /* fall through */
        case 1:
-                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+                        fd.entrylength);
                if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
                        printk(KERN_ERR "hfs: bad catalog folder thread\n");
                        err = -EIO;
@@ -169,14 +178,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        err = -EIO;
                        goto out;
                }
-                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+                        fd.entrylength);
                type = be16_to_cpu(entry.type);
                len = HFSPLUS_MAX_STRLEN;
                err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
                if (err)
                        goto out;
                if (type == HFSPLUS_FOLDER) {
-                        if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
+                        if (fd.entrylength <
+                                        sizeof(struct hfsplus_cat_folder)) {
                                printk(KERN_ERR "hfs: small dir entry\n");
                                err = -EIO;
                                goto out;
@@ -202,7 +213,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        err = -EIO;
                        goto out;
                }
-        next:
+next:
                filp->f_pos++;
                if (filp->f_pos >= inode->i_size)
                        goto out;
@@ -273,7 +284,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                HFSPLUS_I(inode)->linkid = id;
                cnid = sbi->next_cnid++;
                src_dentry->d_fsdata = (void *)(unsigned long)cnid;
-                res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
+                res = hfsplus_create_cat(cnid, src_dir,
+                        &src_dentry->d_name, inode);
                if (res)
                        /* panic? */
                        goto out;
@@ -485,6 +497,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 };
 const struct file_operations hfsplus_dir_operations = {
+        .fsync          = hfsplus_file_fsync,
        .read           = generic_read_dir,
        .readdir        = hfsplus_readdir,
        .unlocked_ioctl = hfsplus_ioctl,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0c9cb1820a5..52a0bcaa7b6 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,8 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
        return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
 }
-static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+static void __hfsplus_ext_write_extent(struct inode *inode,
+                struct hfs_find_data *fd)
 {
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res;
@@ -95,24 +96,32 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
                                HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
        res = hfs_brec_find(fd);
-        if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
+        if (hip->extent_state & HFSPLUS_EXT_NEW) {
                if (res != -ENOENT)
                        return;
                hfs_brec_insert(fd, hip->cached_extents,
                                sizeof(hfsplus_extent_rec));
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
        } else {
                if (res)
                        return;
                hfs_bnode_write(fd->bnode, hip->cached_extents,
                                fd->entryoffset, fd->entrylength);
-                hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+                hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
        }
+        /*
+         * We can't just use hfsplus_mark_inode_dirty here, because we
+         * also get called from hfsplus_write_inode, which should not
+         * redirty the inode.  Instead the callers have to be careful
+         * to explicitly mark the inode dirty, too.
+         */
+        set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
 }
 static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-        if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
+        if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
                struct hfs_find_data fd;
                hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -144,18 +153,20 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
                return -ENOENT;
        if (fd->entrylength != sizeof(hfsplus_extent_rec))
                return -EIO;
-        hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec));
+        hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
+                sizeof(hfsplus_extent_rec));
        return 0;
 }
-static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
+static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
+                struct inode *inode, u32 block)
 {
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res;
        WARN_ON(!mutex_is_locked(&hip->extents_lock));
-        if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
+        if (hip->extent_state & HFSPLUS_EXT_DIRTY)
                __hfsplus_ext_write_extent(inode, fd);
        res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
@@ -164,10 +175,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
                                                HFSPLUS_TYPE_DATA);
        if (!res) {
                hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
-                hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
+                hip->cached_blocks =
+                        hfsplus_ext_block_count(hip->cached_extents);
        } else {
                hip->cached_start = hip->cached_blocks = 0;
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
        }
        return res;
 }
@@ -197,6 +209,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res = -EIO;
        u32 ablock, dblock, mask;
+        int was_dirty = 0;
        int shift;
        /* Convert inode block to disk allocation block */
@@ -223,27 +236,37 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
                return -EIO;
        mutex_lock(&hip->extents_lock);
+        /*
+         * hfsplus_ext_read_extent will write out a cached extent into
+         * the extents btree.  In that case we may have to mark the inode
+         * dirty even for a pure read of an extent here.
+         */
+        was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
        res = hfsplus_ext_read_extent(inode, ablock);
-        if (!res) {
+        if (res) {
-                dblock = hfsplus_ext_find_block(hip->cached_extents,
-                                                ablock - hip->cached_start);
-        } else {
                mutex_unlock(&hip->extents_lock);
                return -EIO;
        }
+        dblock = hfsplus_ext_find_block(hip->cached_extents,
+                                        ablock - hip->cached_start);
        mutex_unlock(&hip->extents_lock);
 done:
-        dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
+        dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
+                inode->i_ino, (long long)iblock, dblock);
        mask = (1 << sbi->fs_shift) - 1;
-        map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
+        map_bh(bh_result, sb,
+                (dblock << sbi->fs_shift) + sbi->blockoffset +
+                        (iblock & mask));
        if (create) {
                set_buffer_new(bh_result);
                hip->phys_size += sb->s_blocksize;
                hip->fs_blocks++;
                inode_add_bytes(inode, sb->s_blocksize);
-                mark_inode_dirty(inode);
        }
+        if (create || was_dirty)
+                mark_inode_dirty(inode);
        return 0;
 }
@@ -326,7 +349,8 @@ found:
        }
 }
-int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type)
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+                struct hfsplus_fork_raw *fork, int type)
 {
        struct hfs_find_data fd;
        hfsplus_extent_rec ext_entry;
@@ -373,12 +397,13 @@ int hfsplus_file_extend(struct inode *inode)
        u32 start, len, goal;
        int res;
-        if (sbi->alloc_file->i_size * 8 <
+        if (sbi->total_blocks - sbi->free_blocks + 8 >
-            sbi->total_blocks - sbi->free_blocks + 8) {
+                        sbi->alloc_file->i_size * 8) {
-                // extend alloc file
+                /* extend alloc file */
-                printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+                printk(KERN_ERR "hfs: extend alloc file! "
-                                sbi->alloc_file->i_size * 8,
+                                "(%llu,%u,%u)\n",
-                                sbi->total_blocks, sbi->free_blocks);
+                        sbi->alloc_file->i_size * 8,
+                        sbi->total_blocks, sbi->free_blocks);
                return -ENOSPC;
        }
@@ -429,7 +454,7 @@ int hfsplus_file_extend(struct inode *inode)
                                         start, len);
                if (!res) {
                        hfsplus_dump_extent(hip->cached_extents);
-                        hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                        hip->extent_state |= HFSPLUS_EXT_DIRTY;
                        hip->cached_blocks += len;
                } else if (res == -ENOSPC)
                        goto insert_extent;
@@ -438,7 +463,7 @@ out:
        mutex_unlock(&hip->extents_lock);
        if (!res) {
                hip->alloc_blocks += len;
-                mark_inode_dirty(inode);
+                hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
        }
        return res;
@@ -450,7 +475,7 @@ insert_extent:
        hip->cached_extents[0].start_block = cpu_to_be32(start);
        hip->cached_extents[0].block_count = cpu_to_be32(len);
        hfsplus_dump_extent(hip->cached_extents);
-        hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+        hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
        hip->cached_start = hip->alloc_blocks;
        hip->cached_blocks = len;
@@ -466,8 +491,9 @@ void hfsplus_file_truncate(struct inode *inode)
        u32 alloc_cnt, blk_cnt, start;
        int res;
-        dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+        dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
-                inode->i_ino, (long long)hip->phys_size, inode->i_size);
+                inode->i_ino, (long long)hip->phys_size,
+                inode->i_size);
        if (inode->i_size > hip->phys_size) {
                struct address_space *mapping = inode->i_mapping;
@@ -481,7 +507,8 @@ void hfsplus_file_truncate(struct inode *inode)
                                                &page, &fsdata);
                if (res)
                        return;
-                res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+                res = pagecache_write_end(NULL, mapping, size,
+                        0, 0, page, fsdata);
                if (res < 0)
                        return;
                mark_inode_dirty(inode);
@@ -513,12 +540,12 @@ void hfsplus_file_truncate(struct inode *inode)
                                     alloc_cnt - start, alloc_cnt - blk_cnt);
                hfsplus_dump_extent(hip->cached_extents);
                if (blk_cnt > start) {
-                        hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                        hip->extent_state |= HFSPLUS_EXT_DIRTY;
                        break;
                }
                alloc_cnt = start;
                hip->cached_start = hip->cached_blocks = 0;
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
                hfs_brec_remove(&fd);
        }
        hfs_find_exit(&fd);
@@ -527,7 +554,8 @@ void hfsplus_file_truncate(struct inode *inode)
        hip->alloc_blocks = blk_cnt;
 out:
        hip->phys_size = inode->i_size;
-        hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+        hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
+                sb->s_blocksize_bits;
        inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
-        mark_inode_dirty(inode);
+        hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index cb3653efb57..d6857523336 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,13 +23,16 @@
 #define DBG_EXTENT      0x00000020
 #define DBG_BITMAP      0x00000040
-//#define DBG_MASK      (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
+#if 0
-//#define DBG_MASK      (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK        (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
-//#define DBG_MASK      (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#define DBG_MASK        (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK        (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#endif
 #define DBG_MASK        (0)
 #define dprint(flg, fmt, args...) \
-        if (flg & DBG_MASK) printk(fmt , ## args)
+        if (flg & DBG_MASK) \
+                printk(fmt , ## args)
 /* Runtime config options */
 #define HFSPLUS_DEF_CR_TYPE    0x3F3F3F3F  /* '????' */
@@ -37,7 +40,8 @@
 #define HFSPLUS_TYPE_DATA 0x00
 #define HFSPLUS_TYPE_RSRC 0xFF
-typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *);
+typedef int (*btree_keycmp)(const hfsplus_btree_key *,
+                const hfsplus_btree_key *);
 #define NODE_HASH_SIZE  256
@@ -61,7 +65,6 @@ struct hfs_btree {
        unsigned int max_key_len;
        unsigned int depth;
-        //unsigned int map1_size, map_size;
        struct mutex tree_lock;
        unsigned int pages_per_bnode;
@@ -107,8 +110,8 @@ struct hfsplus_vh;
 struct hfs_btree;
 struct hfsplus_sb_info {
-        struct buffer_head *s_vhbh;
        struct hfsplus_vh *s_vhdr;
+        struct hfsplus_vh *s_backup_vhdr;
        struct hfs_btree *ext_tree;
        struct hfs_btree *cat_tree;
        struct hfs_btree *attr_tree;
@@ -118,7 +121,8 @@ struct hfsplus_sb_info {
        /* Runtime variables */
        u32 blockoffset;
-        u32 sect_count;
+        sector_t part_start;
+        sector_t sect_count;
        int fs_shift;
        /* immutable data from the volume header */
@@ -155,6 +159,12 @@ struct hfsplus_sb_info {
 #define HFSPLUS_SB_FORCE        2
 #define HFSPLUS_SB_HFSX         3
 #define HFSPLUS_SB_CASEFOLD     4
+#define HFSPLUS_SB_NOBARRIER    5
+static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
 struct hfsplus_inode_info {
@@ -170,7 +180,7 @@ struct hfsplus_inode_info {
        u32 cached_blocks;
        hfsplus_extent_rec first_extents;
        hfsplus_extent_rec cached_extents;
-        unsigned long flags;
+        unsigned int extent_state;
        struct mutex extents_lock;
        /*
@@ -185,6 +195,11 @@ struct hfsplus_inode_info {
        u32 linkid;
        /*
+         * Accessed using atomic bitops.
+         */
+        unsigned long flags;
+        /*
         * Protected by i_mutex.
         */
        sector_t fs_blocks;
@@ -195,12 +210,34 @@ struct hfsplus_inode_info {
        struct inode vfs_inode;
 };
-#define HFSPLUS_FLG_RSRC        0x0001
+#define HFSPLUS_EXT_DIRTY       0x0001
-#define HFSPLUS_FLG_EXT_DIRTY   0x0002
+#define HFSPLUS_EXT_NEW         0x0002
-#define HFSPLUS_FLG_EXT_NEW     0x0004
+#define HFSPLUS_I_RSRC          0       /* represents a resource fork */
+#define HFSPLUS_I_CAT_DIRTY     1       /* has changes in the catalog tree */
+#define HFSPLUS_I_EXT_DIRTY     2       /* has changes in the extent tree */
+#define HFSPLUS_I_ALLOC_DIRTY   3       /* has changes in the allocation file */
+#define HFSPLUS_IS_RSRC(inode) \
+        test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
+static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
+{
+        return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+}
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+/*
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
+ * Mark an inode dirty, and also mark the btree in which the
+ * specific type of metadata is stored.
+ * For data or metadata that gets written back into the catalog btree
+ * by hfsplus_write_inode, a plain mark_inode_dirty call is enough.
+ */
+static inline void hfsplus_mark_inode_dirty(struct inode *inode,
+                unsigned int flag)
+{
+        set_bit(flag, &HFSPLUS_I(inode)->flags);
+        mark_inode_dirty(inode);
+}
 struct hfs_find_data {
        /* filled by caller */
@@ -318,9 +355,12 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
 int hfs_brec_goto(struct hfs_find_data *, int);
 /* catalog.c */
-int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
-int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
+                const hfsplus_btree_key *);
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
+                const hfsplus_btree_key *);
+void hfsplus_cat_build_key(struct super_block *sb,
+                hfsplus_btree_key *, u32, struct qstr *);
 int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
 int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
@@ -336,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations;
 int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
 void hfsplus_ext_write_extent(struct inode *);
 int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
-int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int);
+int hfsplus_free_fork(struct super_block *, u32,
+                struct hfsplus_fork_raw *, int);
 int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
@@ -351,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
 int hfsplus_cat_write_inode(struct inode *);
 struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
+int hfsplus_file_fsync(struct file *file, int datasync);
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
@@ -362,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* options.c */
 int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
+int hfsplus_parse_options_remount(char *input, int *force);
 void hfsplus_fill_defaults(struct hfsplus_sb_info *);
 int hfsplus_show_options(struct seq_file *, struct vfsmount *);
@@ -375,45 +418,26 @@ extern u16 hfsplus_decompose_table[];
 extern u16 hfsplus_compose_table[];
 /* unicode.c */
-int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+int hfsplus_strcasecmp(const struct hfsplus_unistr *,
-int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+                const struct hfsplus_unistr *);
-int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
+int hfsplus_strcmp(const struct hfsplus_unistr *,
-int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
+                const struct hfsplus_unistr *);
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
+int hfsplus_uni2asc(struct super_block *,
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
+                const struct hfsplus_unistr *, char *, int *);
+int hfsplus_asc2uni(struct super_block *,
+                struct hfsplus_unistr *, const char *, int);
+int hfsplus_hash_dentry(const struct dentry *dentry,
+                const struct inode *inode, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 /* wrapper.c */
 int hfsplus_read_wrapper(struct super_block *);
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
-/* access macros */
+                void *data, int rw);
-static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
-{
-        return sb->s_fs_info;
-}
-static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
-{
-        return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
-}
-#define sb_bread512(sb, sec, data) ({                   \
-        struct buffer_head *__bh;                       \
-        sector_t __block;                               \
-        loff_t __start;                                 \
-        int __offset;                                   \
-                                                        \
-        __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
-        __block = __start >> (sb)->s_blocksize_bits;    \
-        __offset = __start & ((sb)->s_blocksize - 1);   \
-        __bh = sb_bread((sb), __block);                 \
-        if (likely(__bh != NULL))                       \
-                data = (void *)(__bh->b_data + __offset);\
-        else                                            \
-                data = NULL;                            \
-        __bh;                                           \
-})
 /* time macros */
 #define __hfsp_mt2ut(t)         (be32_to_cpu(t) - 2082844800U)
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 6892899fd6f..927cdd6d5bf 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
 #define HFSP_WRAPOFF_EMBEDSIG     0x7C
 #define HFSP_WRAPOFF_EMBEDEXT     0x7E
-#define HFSP_HIDDENDIR_NAME     "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
+#define HFSP_HIDDENDIR_NAME \
+        "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
 #define HFSP_HARDLINK_TYPE      0x686c6e6b      /* 'hlnk' */
 #define HFSP_HFSPLUS_CREATOR    0x6866732b      /* 'hfs+' */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8afd7e84f98..a8df651747f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
 * Inode handling routines
 */
+#include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
        if (!tree)
                return 0;
        if (tree->node_size >= PAGE_CACHE_SIZE) {
-                nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+                nidx = page->index >>
+                        (tree->node_size_shift - PAGE_CACHE_SHIFT);
                spin_lock(&tree->hash_lock);
                node = hfs_bnode_findhash(tree, nidx);
                if (!node)
@@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
                }
                spin_unlock(&tree->hash_lock);
        } else {
-                nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+                nidx = page->index <<
+                        (PAGE_CACHE_SHIFT - tree->node_size_shift);
                i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
                spin_lock(&tree->hash_lock);
                do {
@@ -166,8 +169,8 @@ const struct dentry_operations hfsplus_dentry_operations = {
        .d_compare    = hfsplus_compare_dentry,
 };
-static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
+static struct dentry *hfsplus_file_lookup(struct inode *dir,
-                                          struct nameidata *nd)
+                struct dentry *dentry, struct nameidata *nd)
 {
        struct hfs_find_data fd;
        struct super_block *sb = dir->i_sb;
@@ -190,7 +193,9 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
        inode->i_ino = dir->i_ino;
        INIT_LIST_HEAD(&hip->open_dir_list);
        mutex_init(&hip->extents_lock);
-        hip->flags = HFSPLUS_FLG_RSRC;
+        hip->extent_state = 0;
+        hip->flags = 0;
+        set_bit(HFSPLUS_I_RSRC, &hip->flags);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        err = hfsplus_find_cat(sb, dir->i_ino, &fd);
@@ -219,7 +224,8 @@ out:
        return NULL;
 }
-static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
+static void hfsplus_get_perms(struct inode *inode,
+                struct hfsplus_perm *perms, int dir)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
        u16 mode;
@@ -302,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
        return 0;
 }
-static int hfsplus_file_fsync(struct file *filp, int datasync)
+int hfsplus_file_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = filp->f_mapping->host;
+        struct inode *inode = file->f_mapping->host;
-        struct super_block * sb;
+        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
-        int ret, err;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+        int error = 0, error2;
-        /* sync the inode to buffers */
-        ret = write_inode_now(inode, 0);
+        /*
+         * Sync inode metadata into the catalog and extent trees.
-        /* sync the superblock to buffers */
+         */
-        sb = inode->i_sb;
+        sync_inode_metadata(inode, 1);
-        if (sb->s_dirt) {
-                if (!(sb->s_flags & MS_RDONLY))
+        /*
-                        hfsplus_sync_fs(sb, 1);
+         * And explicitly write out the btrees.
-                else
+         */
-                        sb->s_dirt = 0;
+        if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
+                error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+        if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
+                error2 =
+                        filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+                if (!error)
+                        error = error2;
        }
-        /* .. finally sync the buffers to disk */
+        if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
-        err = sync_blockdev(sb->s_bdev);
+                error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
-        if (!ret)
+                if (!error)
-                ret = err;
+                        error = error2;
-        return ret;
+        }
+        if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+        return error;
 }
 static const struct inode_operations hfsplus_file_inode_operations = {
@@ -337,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
 };
 static const struct file_operations hfsplus_file_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .write          = do_sync_write,
@@ -370,6 +388,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
        INIT_LIST_HEAD(&hip->open_dir_list);
        mutex_init(&hip->extents_lock);
        atomic_set(&hip->opencnt, 0);
+        hip->extent_state = 0;
        hip->flags = 0;
        memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
        memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
@@ -457,7 +476,8 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
        }
 }
-void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
+void hfsplus_inode_write_fork(struct inode *inode,
+                struct hfsplus_fork_raw *fork)
 {
        memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
               sizeof(hfsplus_extent_rec));
@@ -499,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
                                        sizeof(struct hfsplus_cat_file));
-                hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ?
+                hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
-                                        &file->data_fork : &file->rsrc_fork);
+                                        &file->rsrc_fork : &file->data_fork);
                hfsplus_get_perms(inode, &file->permissions, 0);
                inode->i_nlink = 1;
                if (S_ISREG(inode->i_mode)) {
                        if (file->permissions.dev)
-                                inode->i_nlink = be32_to_cpu(file->permissions.dev);
+                                inode->i_nlink =
+                                        be32_to_cpu(file->permissions.dev);
                        inode->i_op = &hfsplus_file_inode_operations;
                        inode->i_fop = &hfsplus_file_operations;
                        inode->i_mapping->a_ops = &hfsplus_aops;
@@ -578,7 +599,9 @@ int hfsplus_cat_write_inode(struct inode *inode)
                                        sizeof(struct hfsplus_cat_file));
                hfsplus_inode_write_fork(inode, &file->data_fork);
                hfsplus_cat_set_perms(inode, &file->permissions);
-                if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
+                if (HFSPLUS_FLG_IMMUTABLE &
+                                (file->permissions.rootflags |
+                                        file->permissions.userflags))
                        file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
                else
                        file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
@@ -588,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
                hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_file));
        }
+        set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
 out:
        hfs_find_exit(&fd);
        return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 40a85a3ded6..508ce662ce1 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -28,7 +28,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
        if (inode->i_flags & S_IMMUTABLE)
                flags |= FS_IMMUTABLE_FL;
-        if (inode->i_flags |= S_APPEND)
+        if (inode->i_flags & S_APPEND)
                flags |= FS_APPEND_FL;
        if (hip->userflags & HFSPLUS_FLG_NODUMP)
                flags |= FS_NODUMP_FL;
@@ -147,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
                        res = -ERANGE;
        } else
                res = -EOPNOTSUPP;
-        if (!res)
+        if (!res) {
                hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
                                sizeof(struct hfsplus_cat_file));
+                hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+        }
 out:
        hfs_find_exit(&fd);
        return res;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index f9ab276a4d8..bb62a588214 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
        opt_umask, opt_uid, opt_gid,
        opt_part, opt_session, opt_nls,
        opt_nodecompose, opt_decompose,
+        opt_barrier, opt_nobarrier,
        opt_force, opt_err
 };
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
        { opt_nls, "nls=%s" },
        { opt_decompose, "decompose" },
        { opt_nodecompose, "nodecompose" },
+        { opt_barrier, "barrier" },
+        { opt_nobarrier, "nobarrier" },
        { opt_force, "force" },
        { opt_err, NULL }
 };
@@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
        return 0;
 }
+int hfsplus_parse_options_remount(char *input, int *force)
+{
+        char *p;
+        substring_t args[MAX_OPT_ARGS];
+        int token;
+        if (!input)
+                return 0;
+        while ((p = strsep(&input, ",")) != NULL) {
+                if (!*p)
+                        continue;
+                token = match_token(p, tokens, args);
+                switch (token) {
+                case opt_force:
+                        *force = 1;
+                        break;
+                default:
+                        break;
+                }
+        }
+        return 1;
+}
 /* Parse options from mount. Returns 0 on failure */
 /* input is the options passed to mount() as a string */
 int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
@@ -136,7 +165,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                        if (p)
                                sbi->nls = load_nls(p);
                        if (!sbi->nls) {
-                                printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p);
+                                printk(KERN_ERR "hfs: unable to load "
+                                                "nls mapping \"%s\"\n",
+                                        p);
                                kfree(p);
                                return 0;
                        }
@@ -148,6 +179,12 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                case opt_nodecompose:
                        set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                        break;
+                case opt_barrier:
+                        clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+                        break;
+                case opt_nobarrier:
+                        set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+                        break;
                case opt_force:
                        set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
                        break;
@@ -177,7 +214,8 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
        if (sbi->type != HFSPLUS_DEF_CR_TYPE)
                seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
-        seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid);
+        seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
+                sbi->uid, sbi->gid);
        if (sbi->part >= 0)
                seq_printf(seq, ",part=%u", sbi->part);
        if (sbi->session >= 0)
@@ -186,5 +224,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                seq_printf(seq, ",nls=%s", sbi->nls->charset);
        if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
                seq_printf(seq, ",nodecompose");
+        if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                seq_printf(seq, ",nobarrier");
        return 0;
 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 208b16c645c..d66ad113b1c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
 * linux/fs/hfsplus/part_tbl.c
 *
 * Copyright (C) 1996-1997  Paul H. Hargrove
- * This file may be distributed under the terms of the GNU General Public License.
+ * This file may be distributed under the terms of
+ * the GNU General Public License.
 *
 * Original code to handle the new style Mac partition table based on
 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
@@ -13,6 +14,7 @@
 *
 */
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 /* offsets to various blocks */
@@ -58,77 +60,94 @@ struct new_pmap {
 */
 struct old_pmap {
        __be16          pdSig;  /* Signature bytes */
-        struct  old_pmap_entry {
+        struct old_pmap_entry {
                __be32  pdStart;
                __be32  pdSize;
                __be32  pdFSID;
        }       pdEntry[42];
 } __packed;
+static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
+                sector_t *part_start, sector_t *part_size)
+{
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        int i;
+        for (i = 0; i < 42; i++) {
+                struct old_pmap_entry *p = &pm->pdEntry[i];
+                if (p->pdStart && p->pdSize &&
+                    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
+                    (sbi->part < 0 || sbi->part == i)) {
+                        *part_start += be32_to_cpu(p->pdStart);
+                        *part_size = be32_to_cpu(p->pdSize);
+                        return 0;
+                }
+        }
+        return -ENOENT;
+}
+static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
+                sector_t *part_start, sector_t *part_size)
+{
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        int size = be32_to_cpu(pm->pmMapBlkCnt);
+        int res;
+        int i = 0;
+        do {
+                if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
+                    (sbi->part < 0 || sbi->part == i)) {
+                        *part_start += be32_to_cpu(pm->pmPyPartStart);
+                        *part_size = be32_to_cpu(pm->pmPartBlkCnt);
+                        return 0;
+                }
+                if (++i >= size)
+                        return -ENOENT;
+                res = hfsplus_submit_bio(sb->s_bdev,
+                                         *part_start + HFS_PMAP_BLK + i,
+                                         pm, READ);
+                if (res)
+                        return res;
+        } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
+        return -ENOENT;
+}
 /*
- * hfs_part_find()
+ * Parse the partition map looking for the start and length of a
- *
+ * HFS/HFS+ partition.
- * Parse the partition map looking for the
- * start and length of the 'part'th HFS partition.
 */
 int hfs_part_find(struct super_block *sb,
-                  sector_t *part_start, sector_t *part_size)
+                sector_t *part_start, sector_t *part_size)
 {
-        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        void *data;
-        struct buffer_head *bh;
+        int res;
-        __be16 *data;
-        int i, size, res;
+        data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!data)
+                return -ENOMEM;
-        res = -ENOENT;
+        res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
-        bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data);
+                                 data, READ);
-        if (!bh)
+        if (res) {
-                return -EIO;
+                /* The sector buffer is ours; free it before bailing out. */
+                kfree(data);
+                return res;
+        }
-        switch (be16_to_cpu(*data)) {
+        switch (be16_to_cpu(*((__be16 *)data))) {
        case HFS_OLD_PMAP_MAGIC:
-          {
+                res = hfs_parse_old_pmap(sb, data, part_start, part_size);
-                struct old_pmap *pm;
-                struct old_pmap_entry *p;
-                pm = (struct old_pmap *)bh->b_data;
-                p = pm->pdEntry;
-                size = 42;
-                for (i = 0; i < size; p++, i++) {
-                        if (p->pdStart && p->pdSize &&
-                            p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-                            (sbi->part < 0 || sbi->part == i)) {
-                                *part_start += be32_to_cpu(p->pdStart);
-                                *part_size = be32_to_cpu(p->pdSize);
-                                res = 0;
-                        }
-                }
                break;
-          }
        case HFS_NEW_PMAP_MAGIC:
-          {
+                res = hfs_parse_new_pmap(sb, data, part_start, part_size);
-                struct new_pmap *pm;
+                break;
+        default:
-                pm = (struct new_pmap *)bh->b_data;
+                res = -ENOENT;
-                size = be32_to_cpu(pm->pmMapBlkCnt);
-                for (i = 0; i < size;) {
-                        if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-                            (sbi->part < 0 || sbi->part == i)) {
-                                *part_start += be32_to_cpu(pm->pmPyPartStart);
-                                *part_size = be32_to_cpu(pm->pmPartBlkCnt);
-                                res = 0;
-                                break;
-                        }
-                        brelse(bh);
-                        bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
-                        if (!bh)
-                                return -EIO;
-                        if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
-                                break;
-                }
                break;
-          }
        }
-        brelse(bh);
+        kfree(data);
        return res;
 }
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 52cc746d3ba..6ee6ad20acf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
@@ -66,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
        INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
        mutex_init(&HFSPLUS_I(inode)->extents_lock);
        HFSPLUS_I(inode)->flags = 0;
+        HFSPLUS_I(inode)->extent_state = 0;
        HFSPLUS_I(inode)->rsrc_inode = NULL;
        atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
@@ -157,45 +159,65 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
        struct hfsplus_vh *vhdr = sbi->s_vhdr;
+        int write_backup = 0;
+        int error, error2;
+        if (!wait)
+                return 0;
        dprint(DBG_SUPER, "hfsplus_write_super\n");
-        mutex_lock(&sbi->vh_mutex);
-        mutex_lock(&sbi->alloc_mutex);
        sb->s_dirt = 0;
+        /*
+         * Explicitly write out the special metadata inodes.
+         *
+         * While these special inodes are marked as hashed and written
+         * out periodically by the flusher threads, we redirty them
+         * during writeout of normal inodes, and thus the live lock
+         * prevents us from getting the latest state to disk.
+         */
+        error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+        error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+        if (!error)
+                error = error2;
+        error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+        if (!error)
+                error = error2;
+        mutex_lock(&sbi->vh_mutex);
+        mutex_lock(&sbi->alloc_mutex);
        vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
        vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
        vhdr->folder_count = cpu_to_be32(sbi->folder_count);
        vhdr->file_count = cpu_to_be32(sbi->file_count);
-        mark_buffer_dirty(sbi->s_vhbh);
        if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
-                if (sbi->sect_count) {
+                memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
-                        struct buffer_head *bh;
+                write_backup = 1;
-                        u32 block, offset;
-                        block = sbi->blockoffset;
-                        block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
-                        offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
-                        printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
-                                          sbi->blockoffset, sbi->sect_count,
-                                          block, offset);
-                        bh = sb_bread(sb, block);
-                        if (bh) {
-                                vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
-                                if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-                                        memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
-                                        mark_buffer_dirty(bh);
-                                        brelse(bh);
-                                } else
-                                        printk(KERN_WARNING "hfs: backup not found!\n");
-                        }
-                }
        }
+        error2 = hfsplus_submit_bio(sb->s_bdev,
+                                   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
+                                   sbi->s_vhdr, WRITE_SYNC);
+        if (!error)
+                error = error2;
+        if (!write_backup)
+                goto out;
+        error2 = hfsplus_submit_bio(sb->s_bdev,
+                                  sbi->part_start + sbi->sect_count - 2,
+                                  sbi->s_backup_vhdr, WRITE_SYNC);
+        if (!error)
+                error = error2;
+out:
        mutex_unlock(&sbi->alloc_mutex);
        mutex_unlock(&sbi->vh_mutex);
-        return 0;
+        if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+        return error;
 }
 static void hfsplus_write_super(struct super_block *sb)
@@ -215,23 +237,22 @@ static void hfsplus_put_super(struct super_block *sb)
        if (!sb->s_fs_info)
                return;
-        if (sb->s_dirt)
-                hfsplus_write_super(sb);
        if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
                struct hfsplus_vh *vhdr = sbi->s_vhdr;
                vhdr->modify_date = hfsp_now2mt();
                vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-                mark_buffer_dirty(sbi->s_vhbh);
-                sync_dirty_buffer(sbi->s_vhbh);
+                hfsplus_sync_fs(sb, 1);
        }
        hfs_btree_close(sbi->cat_tree);
        hfs_btree_close(sbi->ext_tree);
        iput(sbi->alloc_file);
        iput(sbi->hidden_dir);
-        brelse(sbi->s_vhbh);
+        kfree(sbi->s_vhdr);
+        kfree(sbi->s_backup_vhdr);
        unload_nls(sbi->nls);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
@@ -263,26 +284,31 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
                return 0;
        if (!(*flags & MS_RDONLY)) {
                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
-                struct hfsplus_sb_info sbi;
+                int force = 0;
-                memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
+                if (!hfsplus_parse_options_remount(data, &force))
-                sbi.nls = HFSPLUS_SB(sb)->nls;
-                if (!hfsplus_parse_options(data, &sbi))
                        return -EINVAL;
                if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-                        printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
+                        printk(KERN_WARNING "hfs: filesystem was "
-                               "running fsck.hfsplus is recommended.  leaving read-only.\n");
+                                        "not cleanly unmounted, "
+                                        "running fsck.hfsplus is recommended.  "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
-                } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
+                } else if (force) {
                        /* nothing */
-                } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+                } else if (vhdr->attributes &
-                        printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
+                                cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+                        printk(KERN_WARNING "hfs: filesystem is marked locked, "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
-                } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+                } else if (vhdr->attributes &
-                        printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n");
+                                cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+                        printk(KERN_WARNING "hfs: filesystem is "
+                                        "marked journaled, "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
                }
@@ -372,17 +398,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-                printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
+                printk(KERN_WARNING "hfs: Filesystem was "
-                       "running fsck.hfsplus is recommended.  mounting read-only.\n");
+                                "not cleanly unmounted, "
+                                "running fsck.hfsplus is recommended.  "
+                                "mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
        } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
                /* nothing */
        } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
-        } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
+        } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
-                printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
+                        !(sb->s_flags & MS_RDONLY)) {
-                       "use the force option at your own risk, mounting read-only.\n");
+                printk(KERN_WARNING "hfs: write access to "
+                                "a journaled filesystem is not supported, "
+                                "use the force option at your own risk, "
+                                "mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
        }
@@ -419,7 +450,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                err = -ENOMEM;
                goto cleanup;
        }
-        sb->s_root->d_op = &hfsplus_dentry_operations;
+        d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
        str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
        str.name = HFSP_HIDDENDIR_NAME;
@@ -449,19 +480,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        be32_add_cpu(&vhdr->write_count, 1);
        vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
        vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-        mark_buffer_dirty(sbi->s_vhbh);
+        hfsplus_sync_fs(sb, 1);
-        sync_dirty_buffer(sbi->s_vhbh);
        if (!sbi->hidden_dir) {
-                printk(KERN_DEBUG "hfs: create hidden dir...\n");
                mutex_lock(&sbi->vh_mutex);
                sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
                hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
                                   &str, sbi->hidden_dir);
                mutex_unlock(&sbi->vh_mutex);
-                mark_inode_dirty(sbi->hidden_dir);
+                hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
        }
 out:
        unload_nls(sbi->nls);
@@ -488,11 +516,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
        return i ? &i->vfs_inode : NULL;
 }
-static void hfsplus_destroy_inode(struct inode *inode)
+static void hfsplus_i_callback(struct rcu_head *head)
 {
+        /* RCU callback: map the rcu_head back to the inode that embeds it. */
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
+/* Queue the inode with call_rcu(); hfsplus_i_callback() does the free. */
+static void hfsplus_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hfsplus_i_callback);
+}
 #define HFSPLUS_INODE_SIZE      sizeof(struct hfsplus_inode_info)
 static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b66d67de882..a3f0bfcc881 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
 /* Returns folded char, or 0 if ignorable */
 static inline u16 case_fold(u16 c)
 {
-        u16 tmp;
+        u16 tmp;
+        /* First lookup is indexed by the high byte of c. */
-        tmp = hfsplus_case_fold_table[c >> 8];
+        tmp = hfsplus_case_fold_table[c >> 8];
-        if (tmp)
+        if (tmp)
+                /* Non-zero entry: use it as a base, indexed by the low byte. */
-                tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
+                tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
-        else
+        else
+                /* Zero entry: the character is returned unchanged. */
-                tmp = c;
+                tmp = c;
-        return tmp;
+        return tmp;
 }
 /* Compare unicode strings, return values like normal strcmp */
@@ -118,7 +118,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
        return NULL;
 }
-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
+int hfsplus_uni2asc(struct super_block *sb,
+                const struct hfsplus_unistr *ustr,
+                char *astr, int *len_p)
 {
        const hfsplus_unichr *ip;
        struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                                goto same;
                        c1 = be16_to_cpu(*ip);
                        if (likely(compose))
-                                ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1);
+                                ce1 = hfsplus_compose_lookup(
+                                        hfsplus_compose_table, c1);
                        if (ce1)
                                break;
                        switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                if (ce2) {
                        i = 1;
                        while (i < ustrlen) {
-                                ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i]));
+                                ce1 = hfsplus_compose_lookup(ce2,
+                                        be16_to_cpu(ip[i]));
                                if (!ce1)
                                        break;
                                i++;
@@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                                goto done;
                        }
                }
-        same:
+same:
                switch (c0) {
                case 0:
                        cc = 0x2400;
@@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                default:
                        cc = c0;
                }
-        done:
+done:
                res = nls->uni2char(cc, op, len);
                if (res < 0) {
                        if (res == -ENAMETOOLONG)
@@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 * Composed unicode characters are decomposed and case-folding is performed
 * if the appropriate bits are (un)set on the superblock.
 */
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *str)
 {
        struct super_block *sb = dentry->d_sb;
        const char *astr;
@@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 * Composed unicode characters are decomposed and case-folding is performed
 * if the appropriate bits are (un)set on the superblock.
 */
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfsplus_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct super_block *sb = dentry->d_sb;
+        struct super_block *sb = parent->d_sb;
        int casefold, decompose, size;
        int dsize1, dsize2, len1, len2;
        const u16 *dstr1, *dstr2;
@@ -375,10 +383,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
        casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
        decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
-        astr1 = s1->name;
+        astr1 = str;
-        len1 = s1->len;
+        len1 = len;
-        astr2 = s2->name;
+        astr2 = name->name;
-        len2 = s2->len;
+        len2 = name->len;
        dsize1 = dsize2 = 0;
        dstr1 = dstr2 = NULL;
@@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
                        astr1 += size;
                        len1 -= size;
-                        if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
+                        if (decompose)
+                                dstr1 = decompose_unichar(c, &dsize1);
+                        if (!decompose || !dstr1) {
                                c1 = c;
                                dstr1 = &c1;
                                dsize1 = 1;
@@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
                        astr2 += size;
                        len2 -= size;
-                        if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
+                        if (decompose)
+                                dstr2 = decompose_unichar(c, &dsize2);
+                        if (!decompose || !dstr2) {
                                c2 = c;
                                dstr2 = &c2;
                                dsize2 = 1;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 8972c20b321..196231794f6 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
        u16 embed_count;
 };
+/*
+ * Completion handler installed as bi_end_io by hfsplus_submit_bio().
+ *
+ * On error the BIO_UPTODATE flag is cleared so the submitter can detect the
+ * failure; the completion stored in bi_private is then signalled.
+ */
+static void hfsplus_end_io_sync(struct bio *bio, int err)
+{
+        if (err)
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        complete(bio->bi_private);
+}
+/*
+ * hfsplus_submit_bio - submit one HFSPLUS_SECTOR_SIZE transfer and wait for it
+ * @bdev:   block device to read from or write to
+ * @sector: start sector of the transfer
+ * @data:   buffer of HFSPLUS_SECTOR_SIZE bytes; it is mapped with
+ *          virt_to_page(), so it must be directly mapped kernel memory
+ * @rw:     request type passed through to submit_bio() (e.g. READ, WRITE_SYNC)
+ *
+ * Blocks until hfsplus_end_io_sync() signals completion.
+ *
+ * Returns 0 on success or -EIO if the bio did not complete up to date.
+ */
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
+                void *data, int rw)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        struct bio *bio;
+        int ret = 0;
+        bio = bio_alloc(GFP_NOIO, 1);
+        bio->bi_sector = sector;
+        bio->bi_bdev = bdev;
+        bio->bi_end_io = hfsplus_end_io_sync;
+        bio->bi_private = &wait;
+        /*
+         * We always submit one sector at a time, so bio_add_page must not fail.
+         */
+        if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
+                         offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
+                BUG();
+        submit_bio(rw, bio);
+        wait_for_completion(&wait);
+        if (!bio_flagged(bio, BIO_UPTODATE))
+                ret = -EIO;
+        /*
+         * The end_io handler does not release the bio, so drop the
+         * reference obtained from bio_alloc() here to avoid leaking it.
+         */
+        bio_put(bio);
+        return ret;
+}
 static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
 {
        u32 extent;
@@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
           !(attrib & HFSP_WRAP_ATTRIB_SPARED))
                return 0;
-        wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
+        wd->ablk_size =
+                be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
        if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
                return 0;
        if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
                return 0;
-        wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
+        wd->ablk_start =
+                be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
        extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
        wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -68,7 +104,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
        if (HFSPLUS_SB(sb)->session >= 0) {
                te.cdte_track = HFSPLUS_SB(sb)->session;
                te.cdte_format = CDROM_LBA;
-                res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
+                res = ioctl_by_bdev(sb->s_bdev,
+                        CDROMREADTOCENTRY, (unsigned long)&te);
                if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
                        *start = (sector_t)te.cdte_addr.lba << 2;
                        return 0;
@@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
                return -EINVAL;
        }
        ms_info.addr_format = CDROM_LBA;
-        res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
+        res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
+                (unsigned long)&ms_info);
        if (!res && ms_info.xa_flag)
                *start = (sector_t)ms_info.addr.lba << 2;
        return 0;
@@ -88,100 +126,112 @@ static int hfsplus_get_last_session(struct super_block *sb,
 int hfsplus_read_wrapper(struct super_block *sb)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
-        struct buffer_head *bh;
-        struct hfsplus_vh *vhdr;
        struct hfsplus_wd wd;
        sector_t part_start, part_size;
        u32 blocksize;
+        int error = 0;
+        error = -EINVAL;
        blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
        if (!blocksize)
-                return -EINVAL;
+                goto out;
        if (hfsplus_get_last_session(sb, &part_start, &part_size))
-                return -EINVAL;
+                goto out;
        if ((u64)part_start + part_size > 0x100000000ULL) {
                pr_err("hfs: volumes larger than 2TB are not supported yet\n");
-                return -EINVAL;
+                goto out;
        }
-        while (1) {
-                bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-                if (!bh)
-                        return -EIO;
-                if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
-                        if (!hfsplus_read_mdb(vhdr, &wd))
-                                goto error;
-                        wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
-                        part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
-                        part_size = wd.embed_count * wd.ablk_size;
-                        brelse(bh);
-                        bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-                        if (!bh)
-                                return -EIO;
-                }
-                if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-                        break;
-                if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-                        set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
-                        break;
-                }
-                brelse(bh);
-                /* check for a partition block
+        /*
+         * Allocate sector-sized buffers for the primary and backup volume
+         * headers.  Every failure from here on must unwind through the
+         * out_free_* labels so that neither buffer is leaked.
+         */
+        error = -ENOMEM;
+        sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!sbi->s_vhdr)
+                goto out;
+        sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!sbi->s_backup_vhdr)
+                goto out_free_vhdr;
+reread:
+        error = hfsplus_submit_bio(sb->s_bdev,
+                                   part_start + HFSPLUS_VOLHEAD_SECTOR,
+                                   sbi->s_vhdr, READ);
+        if (error)
+                goto out_free_backup_vhdr;
+        error = -EINVAL;
+        switch (sbi->s_vhdr->signature) {
+        case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
+                set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
+                /*FALLTHRU*/
+        case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
+                break;
+        case cpu_to_be16(HFSP_WRAP_MAGIC):
+                /* Both header buffers are allocated: free them on failure. */
+                if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
+                        goto out_free_backup_vhdr;
+                wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
+                part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
+                part_size = wd.embed_count * wd.ablk_size;
+                goto reread;
+        default:
+                /*
+                 * Check for a partition block.
+                 *
                 * (should do this only for cdrom/loop though)
                 */
                if (hfs_part_find(sb, &part_start, &part_size))
-                        return -EINVAL;
+                        goto out_free_backup_vhdr;
+                goto reread;
+        }
+        error = hfsplus_submit_bio(sb->s_bdev,
+                                   part_start + part_size - 2,
+                                   sbi->s_backup_vhdr, READ);
+        if (error)
+                goto out_free_backup_vhdr;
+        error = -EINVAL;
+        if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
+                printk(KERN_WARNING
+                        "hfs: invalid secondary volume header\n");
+                goto out_free_backup_vhdr;
        }
-        blocksize = be32_to_cpu(vhdr->blocksize);
+        blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
-        brelse(bh);
-        /* block size must be at least as large as a sector
+        /*
-         * and a multiple of 2
+         * Block size must be at least as large as a sector and a multiple of 2.
         */
-        if (blocksize < HFSPLUS_SECTOR_SIZE ||
+        if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
-            ((blocksize - 1) & blocksize))
+                goto out_free_backup_vhdr;
-                return -EINVAL;
        sbi->alloc_blksz = blocksize;
        sbi->alloc_blksz_shift = 0;
        while ((blocksize >>= 1) != 0)
                sbi->alloc_blksz_shift++;
        blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
-        /* align block size to block offset */
+        /*
+         * Align block size to block offset.
+         */
        while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
                blocksize >>= 1;
        if (sb_set_blocksize(sb, blocksize) != blocksize) {
-                printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
+                printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
-                return -EINVAL;
+                        blocksize);
+                goto out_free_backup_vhdr;
        }
        sbi->blockoffset =
                part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+        sbi->part_start = part_start;
        sbi->sect_count = part_size;
        sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
-        bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-        if (!bh)
-                return -EIO;
-        /* should still be the same... */
-        if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
-                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
-                        goto error;
-        } else {
-                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-                        goto error;
-        }
-        sbi->s_vhbh = bh;
-        sbi->s_vhdr = vhdr;
        return 0;
- error:
-        brelse(bh);
+out_free_backup_vhdr:
-        return -EINVAL;
+        kfree(sbi->s_backup_vhdr);
+out_free_vhdr:
+        kfree(sbi->s_vhdr);
+out:
+        return error;
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2c0f148a49e..d3244d949a4 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
-static int hostfs_d_delete(struct dentry *dentry)
+static int hostfs_d_delete(const struct dentry *dentry)
 {
+        /* The dentry argument is not inspected; the answer is always 1. */
        return 1;
 }
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
 static char *__dentry_name(struct dentry *dentry, char *name)
 {
-        char *p = __dentry_path(dentry, name, PATH_MAX);
+        char *p = dentry_path_raw(dentry, name, PATH_MAX);
        char *root;
        size_t len;
-        spin_unlock(&dcache_lock);
        root = dentry->d_sb->s_fs_info;
        len = strlen(root);
        if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
        if (!name)
                return NULL;
-        spin_lock(&dcache_lock);
        return __dentry_name(dentry, name); /* will unlock */
 }
 static char *inode_name(struct inode *ino)
 {
        struct dentry *dentry;
-        char *name = __getname();
+        char *name;
-        if (!name)
-                return NULL;
-        spin_lock(&dcache_lock);
+        /*
+         * Look up a dentry alias for the inode; it is released with dput()
+         * below once the name has been built.
+         */
+        dentry = d_find_alias(ino);
-        if (list_empty(&ino->i_dentry)) {
+        if (!dentry)
-                spin_unlock(&dcache_lock);
-                __putname(name);
                return NULL;
-        }
-        dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
+        /* dentry_name() may return NULL; that result is passed on as-is. */
+        name = dentry_name(dentry);
-        return __dentry_name(dentry, name); /* will unlock */
+        dput(dentry);
+        return name;
 }
 static char *follow_link(char *link)
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
        }
 }
-static void hostfs_destroy_inode(struct inode *inode)
+static void hostfs_i_callback(struct rcu_head *head)
 {
+        /* RCU callback: map the rcu_head back to the inode that embeds it. */
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kfree(HOSTFS_I(inode));
 }
+/* Queue the inode with call_rcu(); hostfs_i_callback() does the kfree(). */
+static void hostfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hostfs_i_callback);
+}
 static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
        const char *root_path = vfs->mnt_sb->s_fs_info;
@@ -609,7 +612,7 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
                goto out_put;
        d_add(dentry, inode);
-        dentry->d_op = &hostfs_dentry_ops;
+        d_set_d_op(dentry, &hostfs_dentry_ops);
        return NULL;
 out_put:
@@ -746,11 +749,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
        return err;
 }
-int hostfs_permission(struct inode *ino, int desired)
+int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
 {
        char *name;
        int r = 0, w = 0, x = 0, err;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        if (desired & MAY_READ) r = 1;
        if (desired & MAY_WRITE) w = 1;
        if (desired & MAY_EXEC) x = 1;
@@ -765,7 +771,7 @@ int hostfs_permission(struct inode *ino, int desired)
                err = access_file(name, r, w, x);
        __putname(name);
        if (!err)
-                err = generic_permission(ino, desired, NULL);
+                err = generic_permission(ino, desired, flags, NULL);
        return err;
 }
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5..32c13a94e1e 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
 * Note: the dentry argument is the parent dentry.
 */
-static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        unsigned long    hash;
        int              i;
@@ -34,19 +35,25 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
        return 0;
 }
-static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int hpfs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        unsigned al=a->len;
+        unsigned al = len;
-        unsigned bl=b->len;
+        unsigned bl = name->len;
-        hpfs_adjust_length(a->name, &al);
+        hpfs_adjust_length(str, &al);
        /*hpfs_adjust_length(b->name, &bl);*/
-        /* 'a' is the qstr of an already existing dentry, so the name
-         * must be valid. 'b' must be validated first.
+        /*
+         * 'str' is the name of an already existing dentry, so the name
+         * must be valid. 'name' must be validated first.
         */
-        if (hpfs_chk_name(b->name, &bl))
+        if (hpfs_chk_name(name->name, &bl))
                return 1;
+        /* The superblock is taken from the parent dentry. */
-        if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
+        if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
                return 1;
        return 0;
 }
@@ -58,5 +65,5 @@ static const struct dentry_operations hpfs_dentry_operations = {
 void hpfs_set_dentry_operations(struct dentry *dentry)
 {
-        dentry->d_op = &hpfs_dentry_operations;
+        /* Install the ops via d_set_d_op() instead of assigning d_op directly. */
+        d_set_d_op(dentry, &hpfs_dentry_operations);
 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f6..f4ad9e31ddc 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
                        unlock_kernel();
                        return -ENOSPC;
                }
-                if (generic_permission(inode, MAY_WRITE, NULL) ||
+                if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
                    !S_ISREG(inode->i_mode) ||
                    get_write_access(inode)) {
                        d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6c5f01597c3..49935ba78db 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void hpfs_destroy_inode(struct inode *inode)
+static void hpfs_i_callback(struct rcu_head *head)
 {
+        /* RCU callback: map the rcu_head back to the inode that embeds it. */
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
+/* Queue the inode with call_rcu(); hpfs_i_callback() does the free. */
+static void hpfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hpfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f702b5f713f..87ed48e0343 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
        mntput(ino->i_sb->s_fs_info);
 }
-static void hppfs_destroy_inode(struct inode *inode)
+static void hppfs_i_callback(struct rcu_head *head)
 {
+        /* RCU callback: map the rcu_head back to the inode that embeds it. */
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kfree(HPPFS_I(inode));
 }
+/* Queue the inode with call_rcu(); hppfs_i_callback() does the kfree(). */
+static void hppfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hppfs_i_callback);
+}
 static const struct super_operations hppfs_sbops = {
        .alloc_inode    = hppfs_alloc_inode,
        .destroy_inode  = hppfs_destroy_inode,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a5fe68189ee..9885082b470 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
        return &p->vfs_inode;
 }
+/*
+ * RCU callback: recover the inode from its embedded i_rcu head, reinitialise
+ * i_dentry and return the containing object to hugetlbfs_inode_cachep.
+ */
+static void hugetlbfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+}
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
        hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
        mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
-        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+        /* The accounting above runs now; the free is deferred via call_rcu(). */
+        call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
 }
 static const struct address_space_operations hugetlbfs_aops = {
diff --git a/fs/inode.c b/fs/inode.c
index ae2727ab0c3..da85e56378f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,26 +102,29 @@ static DECLARE_RWSEM(iprune_sem);
 */
 struct inodes_stat_t inodes_stat;
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 static struct kmem_cache *inode_cachep __read_mostly;
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-        return percpu_counter_sum_positive(&nr_inodes);
+        int i;
+        int sum = 0;
+        /* Add up the per-CPU nr_inodes counters of every possible CPU. */
+        for_each_possible_cpu(i)
+                sum += per_cpu(nr_inodes, i);
+        /* A negative total is reported as zero. */
+        return sum < 0 ? 0 : sum;
 }
 static inline int get_nr_inodes_unused(void)
 {
-        return percpu_counter_sum_positive(&nr_inodes_unused);
+        /* The unused count is read directly from inodes_stat. */
+        return inodes_stat.nr_unused;
 }
 int get_nr_dirty_inodes(void)
 {
+        /*
+         * Not actually dirty inodes, but a wild approximation: the total
+         * inode count minus the unused count, never less than zero.
+         */
        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
        return nr_dirty > 0 ? nr_dirty : 0;
 }
 /*
@@ -132,7 +135,6 @@ int proc_nr_inodes(ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        inodes_stat.nr_inodes = get_nr_inodes();
-        inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -224,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
-        percpu_counter_inc(&nr_inodes);
+        this_cpu_inc(nr_inodes);
        return 0;
 out:
@@ -255,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
        return inode;
 }
+/*
+ * Return an inode to inode_cachep immediately, without deferring the free
+ * through call_rcu().
+ */
+void free_inode_nonrcu(struct inode *inode)
+{
+        kmem_cache_free(inode_cachep, inode);
+}
+EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
@@ -266,10 +274,17 @@ void __destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
-        percpu_counter_dec(&nr_inodes);
+        this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
+static void i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(inode_cachep, inode);
+}
 static void destroy_inode(struct inode *inode)
 {
        BUG_ON(!list_empty(&inode->i_lru));
@@ -277,7 +292,7 @@ static void destroy_inode(struct inode *inode)
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
-                kmem_cache_free(inode_cachep, (inode));
+                call_rcu(&inode->i_rcu, i_callback);
 }
 /*
@@ -335,7 +350,7 @@ static void inode_lru_list_add(struct inode *inode)
 {
        if (list_empty(&inode->i_lru)) {
                list_add(&inode->i_lru, &inode_lru);
-                percpu_counter_inc(&nr_inodes_unused);
+                inodes_stat.nr_unused++;
        }
 }
@@ -343,7 +358,7 @@ static void inode_lru_list_del(struct inode *inode)
 {
        if (!list_empty(&inode->i_lru)) {
                list_del_init(&inode->i_lru);
-                percpu_counter_dec(&nr_inodes_unused);
+                inodes_stat.nr_unused--;
        }
 }
@@ -430,6 +445,7 @@ void end_writeback(struct inode *inode)
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        inode_sync_wait(inode);
+        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
 }
 EXPORT_SYMBOL(end_writeback);
@@ -513,7 +529,7 @@ void evict_inodes(struct super_block *sb)
                list_move(&inode->i_lru, &dispose);
                list_del_init(&inode->i_wb_list);
                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
        }
        spin_unlock(&inode_lock);
@@ -554,7 +570,7 @@ int invalidate_inodes(struct super_block *sb)
                list_move(&inode->i_lru, &dispose);
                list_del_init(&inode->i_wb_list);
                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
        }
        spin_unlock(&inode_lock);
@@ -616,7 +632,7 @@ static void prune_icache(int nr_to_scan)
                if (atomic_read(&inode->i_count) ||
                    (inode->i_state & ~I_REFERENCED)) {
                        list_del_init(&inode->i_lru);
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
                        continue;
                }
@@ -650,7 +666,7 @@ static void prune_icache(int nr_to_scan)
                 */
                list_move(&inode->i_lru, &freeable);
                list_del_init(&inode->i_wb_list);
-                percpu_counter_dec(&nr_inodes_unused);
+                inodes_stat.nr_unused--;
        }
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1664,6 @@ void __init inode_init(void)
                                         SLAB_MEM_SPREAD),
                                         init_once);
        register_shrinker(&icache_shrinker);
-        percpu_counter_init(&nr_inodes, 0);
-        percpu_counter_init(&nr_inodes_unused, 0);
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4..9687c2ee273 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
+extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
                                struct vfsmount *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bfdeb82a53b..844a7903c72 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -26,16 +26,32 @@
 #define BEQUIET
-static int isofs_hashi(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
-static int isofs_hash(struct dentry *parent, struct qstr *qstr);
+                struct qstr *qstr);
-static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_hash(const struct dentry *parent, const struct inode *inode,
-static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b);
+                struct qstr *qstr);
+static int isofs_dentry_cmpi(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 #ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
-static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr);
+                struct qstr *qstr);
-static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
-static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
+                struct qstr *qstr);
+static int isofs_dentry_cmpi_ms(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp_ms(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 #endif
 static void isofs_put_super(struct super_block *sb)
@@ -65,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void isofs_destroy_inode(struct inode *inode)
+static void isofs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
+static void isofs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, isofs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct iso_inode_info *ei = foo;
@@ -160,7 +183,7 @@ struct iso9660_options{
 * Compute the hash for the isofs name corresponding to the dentry.
 */
 static int
-isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
        const char *name;
        int len;
@@ -181,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
 * Compute the hash for the isofs name corresponding to the dentry.
 */
 static int
-isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
        const char *name;
        int len;
@@ -206,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
 }
 /*
- * Case insensitive compare of two isofs names.
+ * Compare two isofs names.
- */
-static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
-                                struct qstr *b, int ms)
-{
-        int alen, blen;
-        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = a->len;
-        blen = b->len;
-        if (ms) {
-                while (alen && a->name[alen-1] == '.')
-                        alen--;
-                while (blen && b->name[blen-1] == '.')
-                        blen--;
-        }
-        if (alen == blen) {
-                if (strnicmp(a->name, b->name, alen) == 0)
-                        return 0;
-        }
-        return 1;
-}
-/*
- * Case sensitive compare of two isofs names.
 */
-static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a,
+static int isofs_dentry_cmp_common(
-                                        struct qstr *b, int ms)
+                unsigned int len, const char *str,
+                const struct qstr *name, int ms, int ci)
 {
        int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = a->len;
+        alen = name->len;
-        blen = b->len;
+        blen = len;
        if (ms) {
-                while (alen && a->name[alen-1] == '.')
+                while (alen && name->name[alen-1] == '.')
                        alen--;
-                while (blen && b->name[blen-1] == '.')
+                while (blen && str[blen-1] == '.')
                        blen--;
        }
        if (alen == blen) {
-                if (strncmp(a->name, b->name, alen) == 0)
+                if (ci) {
-                        return 0;
+                        if (strnicmp(name->name, str, alen) == 0)
+                                return 0;
+                } else {
+                        if (strncmp(name->name, str, alen) == 0)
+                                return 0;
+                }
        }
        return 1;
 }
 static int
-isofs_hash(struct dentry *dentry, struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hash_common(dentry, qstr, 0);
 }
 static int
-isofs_hashi(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hashi_common(dentry, qstr, 0);
 }
 static int
-isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmp_common(dentry, a, b, 0);
+        return isofs_dentry_cmp_common(len, str, name, 0, 0);
 }
 static int
-isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmpi_common(dentry, a, b, 0);
+        return isofs_dentry_cmp_common(len, str, name, 0, 1);
 }
 #ifdef CONFIG_JOLIET
 static int
-isofs_hash_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hash_common(dentry, qstr, 1);
 }
 static int
-isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hashi_common(dentry, qstr, 1);
 }
 static int
-isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmp_common(dentry, a, b, 1);
+        return isofs_dentry_cmp_common(len, str, name, 1, 0);
 }
 static int
-isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmpi_common(dentry, a, b, 1);
+        return isofs_dentry_cmp_common(len, str, name, 1, 1);
 }
 #endif
@@ -932,7 +949,7 @@ root_found:
                table += 2;
        if (opt.check == 'r')
                table++;
-        s->s_root->d_op = &isofs_dentry_ops[table];
+        d_set_d_op(s->s_root, &isofs_dentry_ops[table]);
        kfree(opt.iocharset);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 0d23abfd428..679a849c3b2 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
        qstr.name = compare;
        qstr.len = dlen;
-        return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr);
+        return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
+                        dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 /*
@@ -171,7 +172,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
        struct inode *inode;
        struct page *page;
-        dentry->d_op = dir->i_sb->s_root->d_op;
+        d_set_d_op(dentry, dir->i_sb->s_root->d_op);
        page = alloc_page(GFP_USER);
        if (!page)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f837ba95352..9e4686900f1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
 #include <linux/bitops.h>
+#include <linux/ratelimit.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_inode_cache);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
        journal = kzalloc(sizeof(*journal), GFP_KERNEL);
        if (!journal)
-                goto fail;
+                return NULL;
        init_waitqueue_head(&journal->j_wait_transaction_locked);
        init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
        err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
        if (err) {
                kfree(journal);
-                goto fail;
+                return NULL;
        }
        spin_lock_init(&journal->j_history_lock);
        return journal;
-fail:
-        return NULL;
 }
 /* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
        struct journal_head *ret;
-        static unsigned long last_warning;
 #ifdef CONFIG_JBD2_DEBUG
        atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
        if (!ret) {
                jbd_debug(1, "out of memory for journal_head\n");
-                if (time_after(jiffies, last_warning + 5*HZ)) {
+                pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
-                        printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-                               __func__);
-                        last_warning = jiffies;
-                }
                while (!ret) {
                        yield();
                        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
 #endif
-struct kmem_cache *jbd2_handle_cache;
+struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
 static int __init journal_init_handle_cache(void)
 {
-        jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
+        jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
-                                sizeof(handle_t),
-                                0,              /* offset */
-                                SLAB_TEMPORARY, /* flags */
-                                NULL);          /* ctor */
        if (jbd2_handle_cache == NULL) {
-                printk(KERN_EMERG "JBD: failed to create handle cache\n");
+                printk(KERN_EMERG "JBD2: failed to create handle cache\n");
+                return -ENOMEM;
+        }
+        jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
+        if (jbd2_inode_cache == NULL) {
+                printk(KERN_EMERG "JBD2: failed to create inode cache\n");
+                kmem_cache_destroy(jbd2_handle_cache);
                return -ENOMEM;
        }
        return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
 {
        if (jbd2_handle_cache)
                kmem_cache_destroy(jbd2_handle_cache);
+        if (jbd2_inode_cache)
+                kmem_cache_destroy(jbd2_inode_cache);
 }
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f..1cad869494f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD2_DEBUG
                int dropped = info.end_transaction - 
                        be32_to_cpu(journal->j_superblock->s_sequence);
-#endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
                          dropped, (dropped == 1) ? "" : "s");
+#endif
                journal->j_transaction_sequence = ++info.end_transaction;
        }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c7934900dcd..faad2bd787c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
                jbd2_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
-                goto out;
        }
-out:
        return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        transaction = handle->h_transaction;
        journal = transaction->t_journal;
-        jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+        jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
        JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -774,7 +772,7 @@ done:
                J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
                            "Possible IO failure.\n");
                page = jh2bh(jh)->b_page;
-                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+                offset = offset_in_page(jh2bh(jh)->b_data);
                source = kmap_atomic(page, KM_USER0);
                /* Fire data frozen trigger just before we copy the data */
                jbd2_buffer_frozen_trigger(jh, source + offset,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bb..95b79672150 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        return rc;
 }
-int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int rc;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d954..3119f59253d 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-extern int jffs2_check_acl(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int, unsigned int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index c86041b866a..853b8e30008 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
        return &f->vfs_inode;
 }
-static void jffs2_destroy_inode(struct inode *inode)
+static void jffs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
+static void jffs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, jffs2_i_callback);
+}
 static void jffs2_i_init_once(void *foo)
 {
        struct jffs2_inode_info *f = foo;
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4..e5de9422fa3 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
        return rc;
 }
-int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878..f9285c4900f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 #ifdef CONFIG_JFS_POSIX_ACL
-int jfs_check_acl(struct inode *, int);
+int jfs_check_acl(struct inode *, int, unsigned int flags);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_acl_chmod(struct inode *inode);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 231ca4af9bc..4414e3a4226 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
 */
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/quotaops.h>
 #include <linux/exportfs.h>
@@ -1465,7 +1466,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
        jfs_info("jfs_lookup: name = %s", name);
        if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
-                dentry->d_op = &jfs_ci_dentry_operations;
+                d_set_d_op(dentry, &jfs_ci_dentry_operations);
        if ((name[0] == '.') && (len == 1))
                inum = dip->i_ino;
@@ -1494,7 +1495,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
        dentry = d_splice_alias(ip, dentry);
        if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
-                dentry->d_op = &jfs_ci_dentry_operations;
+                d_set_d_op(dentry, &jfs_ci_dentry_operations);
        return dentry;
 }
@@ -1573,7 +1574,8 @@ const struct file_operations jfs_dir_operations = {
        .llseek         = generic_file_llseek,
 };
-static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
+                struct qstr *this)
 {
        unsigned long hash;
        int i;
@@ -1586,32 +1588,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
        return 0;
 }
-static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
+static int jfs_ci_compare(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        int i, result = 1;
-        if (a->len != b->len)
+        if (len != name->len)
                goto out;
-        for (i=0; i < a->len; i++) {
+        for (i=0; i < len; i++) {
-                if (tolower(a->name[i]) != tolower(b->name[i]))
+                if (tolower(str[i]) != tolower(name->name[i]))
                        goto out;
        }
        result = 0;
+out:
+        return result;
+}
+static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        /* nd may be NULL (see the !nd check below), so test it before use */
+        if (nd && (nd->flags & LOOKUP_RCU))
+                return -ECHILD;
         /*
-         * We want creates to preserve case.  A negative dentry, a, that
+         * This is not a negative dentry. Always valid.
-         * has a different case than b may cause a new entry to be created
+         *
-         * with the wrong case.  Since we can't tell if a comes from a negative
+         * Note, rename() to an existing directory entry will have ->d_inode,
-         * dentry, we blindly replace it with b.  This should be harmless if
+         * and will use the existing name, not the name specified by the user.
-         * a is not a negative dentry.
+         *
+         * We may be able to drop this positive dentry here. But dropping a
+         * positive dentry isn't a good idea. So it's unsupported, like
+         * rename("filename", "FILENAME"), for now.
          */
-        memcpy((unsigned char *)a->name, b->name, a->len);
+        if (dentry->d_inode)
-out:
+                return 1;
-        return result;
+        /*
+         * This may be nfsd (or something), anyway, we can't see the
+         * intent of this. So, since this can be for creation, drop it.
+         */
+        if (!nd)
+                return 0;
+        /*
+         * Drop the negative dentry, in order to make sure to use the
+         * case sensitive name which is specified by user if this is
+         * for creation.
+         */
+        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+                if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+                        return 0;
+        }
+        return 1;
 }
 const struct dentry_operations jfs_ci_dentry_operations =
 {
        .d_hash = jfs_ci_hash,
        .d_compare = jfs_ci_compare,
+        .d_revalidate = jfs_ci_revalidate,
 };
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0669fc1cc3b..3150d766e0d 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
        return &jfs_inode->vfs_inode;
 }
+static void jfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(jfs_inode_cachep, ji);
+}
 static void jfs_destroy_inode(struct inode *inode)
 {
        struct jfs_inode_info *ji = JFS_IP(inode);
@@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
                ji->active_ag = -1;
        }
        spin_unlock_irq(&ji->ag_lock);
-        kmem_cache_free(jfs_inode_cachep, ji);
+        call_rcu(&inode->i_rcu, jfs_i_callback);
 }
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -517,7 +525,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
                goto out_no_root;
        if (sbi->mntflag & JFS_OS2)
-                sb->s_root->d_op = &jfs_ci_dentry_operations;
+                d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
        /* logical blocks are represented by 40 bits in pxd_t, etc. */
        sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
diff --git a/fs/libfs.c b/fs/libfs.c
index a3accdf528a..889311e3d06 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
 #include <asm/uaccess.h>
+static inline int simple_positive(struct dentry *dentry)
+{
+        return dentry->d_inode && !d_unhashed(dentry);
+}
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
                   struct kstat *stat)
 {
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
-static int simple_delete_dentry(struct dentry *dentry)
+static int simple_delete_dentry(const struct dentry *dentry)
 {
        return 1;
 }
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
-        dentry->d_op = &simple_dentry_operations;
+        d_set_d_op(dentry, &simple_dentry_operations);
        d_add(dentry, NULL);
        return NULL;
 }
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 {
-        mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+        struct dentry *dentry = file->f_path.dentry;
+        mutex_lock(&dentry->d_inode->i_mutex);
        switch (origin) {
                case 1:
                        offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
                        if (offset >= 0)
                                break;
                default:
-                        mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+                        mutex_unlock(&dentry->d_inode->i_mutex);
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
                        struct dentry *cursor = file->private_data;
                        loff_t n = file->f_pos - 2;
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
+                        /* d_lock not required for cursor */
                        list_del(&cursor->d_u.d_child);
-                        p = file->f_path.dentry->d_subdirs.next;
+                        p = dentry->d_subdirs.next;
-                        while (n && p != &file->f_path.dentry->d_subdirs) {
+                        while (n && p != &dentry->d_subdirs) {
                                struct dentry *next;
                                next = list_entry(p, struct dentry, d_u.d_child);
-                                if (!d_unhashed(next) && next->d_inode)
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+                                if (simple_positive(next))
                                        n--;
+                                spin_unlock(&next->d_lock);
                                p = p->next;
                        }
                        list_add_tail(&cursor->d_u.d_child, p);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
                }
        }
-        mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+        mutex_unlock(&dentry->d_inode->i_mutex);
        return offset;
 }
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        i++;
                        /* fallthrough */
                default:
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
                        if (filp->f_pos == 2)
                                list_move(q, &dentry->d_subdirs);
                        for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
                                struct dentry *next;
                                next = list_entry(p, struct dentry, d_u.d_child);
-                                if (d_unhashed(next) || !next->d_inode)
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+                                if (!simple_positive(next)) {
+                                        spin_unlock(&next->d_lock);
                                        continue;
+                                }
-                                spin_unlock(&dcache_lock);
+                                spin_unlock(&next->d_lock);
+                                spin_unlock(&dentry->d_lock);
                                if (filldir(dirent, next->d_name.name, 
                                            next->d_name.len, filp->f_pos, 
                                            next->d_inode->i_ino, 
                                            dt_type(next->d_inode)) < 0)
                                        return 0;
-                                spin_lock(&dcache_lock);
+                                spin_lock(&dentry->d_lock);
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
                                /* next is still alive */
                                list_move(q, p);
+                                spin_unlock(&next->d_lock);
                                p = q;
                                filp->f_pos++;
                        }
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
        }
        return 0;
 }
@@ -259,23 +274,23 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
        return 0;
 }
-static inline int simple_positive(struct dentry *dentry)
-{
-        return dentry->d_inode && !d_unhashed(dentry);
-}
 int simple_empty(struct dentry *dentry)
 {
        struct dentry *child;
        int ret = 0;
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
-        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
+        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
-                if (simple_positive(child))
+                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+                if (simple_positive(child)) {
+                        spin_unlock(&child->d_lock);
                        goto out;
+                }
+                spin_unlock(&child->d_lock);
+        }
        ret = 1;
 out:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return ret;
 }
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab33..ca58d64374c 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_LOCKD) += lockd.o
-lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
+lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
-                svcproc.o svcsubs.o mon.o xdr.o grace.o
+                svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
-lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
 lockd-objs                    := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 00000000000..f848b52c67b
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
+/*
+ * linux/fs/lockd/clnt4xdr.c
+ *
+ * XDR functions to encode/decode NLM version 4 RPC arguments and results.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+#define NLMDBG_FACILITY         NLMDBG_XDR
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
+#  error "NLM host name cannot be larger than NLM's maximum string length!"
+#endif
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM4_void_sz            (0)
+#define NLM4_cookie_sz          (1+(NLM_MAXCOOKIELEN>>2))
+#define NLM4_caller_sz          (1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_owner_sz           (1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_fhandle_sz         (1+(NFS3_FHSIZE>>2))
+#define NLM4_lock_sz            (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
+#define NLM4_holder_sz          (6+NLM4_owner_sz)
+#define NLM4_testargs_sz        (NLM4_cookie_sz+1+NLM4_lock_sz)
+#define NLM4_lockargs_sz        (NLM4_cookie_sz+4+NLM4_lock_sz)
+#define NLM4_cancargs_sz        (NLM4_cookie_sz+2+NLM4_lock_sz)
+#define NLM4_unlockargs_sz      (NLM4_cookie_sz+NLM4_lock_sz)
+#define NLM4_testres_sz         (NLM4_cookie_sz+1+NLM4_holder_sz)
+#define NLM4_res_sz             (NLM4_cookie_sz+1)
+#define NLM4_norep_sz           (0)
+/*
+ * Clamp a file offset into the symmetric range
+ * [-NLM4_OFFSET_MAX, NLM4_OFFSET_MAX] and return it as an s64.
+ */
+static s64 loff_t_to_s64(loff_t offset)
+{
+        if (offset >= NLM4_OFFSET_MAX)
+                return NLM4_OFFSET_MAX;
+        if (offset <= -NLM4_OFFSET_MAX)
+                return -NLM4_OFFSET_MAX;
+        return offset;
+}
+/*
+ * Derive the l_offset/l_len pair for @lock from its file_lock range.
+ * A range that ends at OFFSET_MAX is reported with a length of zero.
+ */
+static void nlm4_compute_offsets(const struct nlm_lock *lock,
+                                 u64 *l_offset, u64 *l_len)
+{
+        const loff_t start = lock->fl.fl_start;
+        const loff_t last = lock->fl.fl_end;
+        BUG_ON(start > NLM4_OFFSET_MAX);
+        BUG_ON(last > NLM4_OFFSET_MAX && last != OFFSET_MAX);
+        *l_offset = loff_t_to_s64(start);
+        if (last != OFFSET_MAX)
+                *l_len = loff_t_to_s64(last - start + 1);
+        else
+                *l_len = 0;
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+/*
+ * @func: name of the decode routine reporting the short buffer
+ * @xdr:  stream being decoded; the words left between xdr->p and
+ *        xdr->end are included in the debug message
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+        dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+                "Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
+/*
+ * Encode/decode NLMv4 basic data types
+ *
+ * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
+ * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
+ * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/*
+ * Append one XDR boolean word: xdr_one when @value is non-zero,
+ * xdr_zero otherwise.
+ */
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+        __be32 *word = xdr_reserve_space(xdr, 4);
+        if (value)
+                *word = xdr_one;
+        else
+                *word = xdr_zero;
+}
+/*
+ * Append @value to the stream as a single big-endian 32-bit word.
+ */
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+        __be32 *word = xdr_reserve_space(xdr, 4);
+        *word = cpu_to_be32(value);
+}
+/*
+ *      typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+/*
+ * Append @length bytes at @data as an XDR opaque.  @length must not
+ * exceed XDR_MAX_NETOBJ.
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+                          const u8 *data, const unsigned int length)
+{
+        __be32 *dst;
+        BUG_ON(length > XDR_MAX_NETOBJ);
+        /* one length word followed by the opaque bytes */
+        dst = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(dst, data, length);
+}
+/*
+ * Decode a netobj: a 4-byte length followed by that many opaque bytes.
+ *
+ * On success obj->len holds the decoded length and obj->data points at
+ * the payload returned by xdr_inline_decode() (the bytes are not
+ * copied).  Returns 0 on success, or -EIO when the length exceeds
+ * XDR_MAX_NETOBJ or the stream does not contain the whole object.
+ */
+static int decode_netobj(struct xdr_stream *xdr,
+                         struct xdr_netobj *obj)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length > XDR_MAX_NETOBJ))
+                goto out_size;
+        /*
+         * Consume the payload too: this checks that all @length bytes
+         * are really present and moves the stream past them, so the
+         * caller's next decode starts at the field that follows.
+         */
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        obj->len = length;
+        obj->data = (u8 *)p;
+        return 0;
+out_size:
+        dprintk("NFS: returned netobj was too long: %u\n", length);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj cookie;
+ */
+/*
+ * Append @cookie as a netobj.  The cookie length must not exceed
+ * NLM_MAXCOOKIELEN.
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+                          const struct nlm_cookie *cookie)
+{
+        const u8 *bytes = (u8 *)&cookie->data;
+        BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
+        encode_netobj(xdr, bytes, cookie->len);
+}
+/*
+ * Decode a cookie netobj into @cookie, copying the bytes.
+ *
+ * A zero-length cookie is replaced by four zero bytes.  Returns 0 on
+ * success, or -EIO when the cookie is longer than NLM_MAXCOOKIELEN or
+ * the stream ends early.
+ */
+static int decode_cookie(struct xdr_stream *xdr,
+                         struct nlm_cookie *cookie)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        u32 length;
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (length == 0) {
+                /* apparently HPUX can return empty cookies */
+                cookie->len = 4;
+                memset(cookie->data, 0, 4);
+                return 0;
+        }
+        if (length > NLM_MAXCOOKIELEN) {
+                dprintk("NFS: returned cookie was too long: %u\n", length);
+                return -EIO;
+        }
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        cookie->len = length;
+        memcpy(cookie->data, p, length);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj fh;
+ */
+/*
+ * Append the file handle @fh as a netobj.  The handle size must not
+ * exceed NFS3_FHSIZE.
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        const u8 *handle = (u8 *)&fh->data;
+        BUG_ON(fh->size > NFS3_FHSIZE);
+        encode_netobj(xdr, handle, fh->size);
+}
+/*
+ *      enum nlm4_stats {
+ *              NLM4_GRANTED = 0,
+ *              NLM4_DENIED = 1,
+ *              NLM4_DENIED_NOLOCKS = 2,
+ *              NLM4_BLOCKED = 3,
+ *              NLM4_DENIED_GRACE_PERIOD = 4,
+ *              NLM4_DEADLCK = 5,
+ *              NLM4_ROFS = 6,
+ *              NLM4_STALE_FH = 7,
+ *              NLM4_FBIG = 8,
+ *              NLM4_FAILED = 9
+ *      };
+ *
+ *      struct nlm4_stat {
+ *              nlm4_stats stat;
+ *      };
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+/*
+ * Append an NLMv4 status word.  @stat is already in network byte order
+ * and is stored unchanged; its host-order value must not exceed
+ * NLM_FAILED.
+ */
+static void encode_nlm4_stat(struct xdr_stream *xdr,
+                             const __be32 stat)
+{
+        __be32 *word;
+        BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
+        word = xdr_reserve_space(xdr, 4);
+        *word = stat;
+}
+/*
+ * Decode an NLMv4 status word into @stat.
+ *
+ * The value is stored in network byte order, unchanged.  Returns 0 on
+ * success, or -EIO when the stream ends early or the status is larger
+ * than NLM_FAILED.
+ */
+static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        /*
+         * Range-check in host byte order: comparing two big-endian
+         * words directly gives the wrong answer on little-endian CPUs.
+         */
+        if (unlikely(be32_to_cpup(p) > NLM_FAILED))
+                goto out_bad_xdr;
+        *stat = *p;
+        return 0;
+out_bad_xdr:
+        dprintk("%s: server returned invalid nlm4_stats value: %u\n",
+                        __func__, be32_to_cpup(p));
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      struct nlm4_holder {
+ *              bool    exclusive;
+ *              int32   svid;
+ *              netobj  oh;
+ *              uint64  l_offset;
+ *              uint64  l_len;
+ *      };
+ */
+/*
+ * Encode an nlm4_holder from result->lock: the exclusive flag, svid,
+ * owner handle, and the 64-bit offset/length pair.
+ */
+static void encode_nlm4_holder(struct xdr_stream *xdr,
+                               const struct nlm_res *result)
+{
+        const struct nlm_lock *lock = &result->lock;
+        u64 l_offset, l_len;
+        __be32 *p;
+        /* "exclusive" means a write lock, as in decode_nlm4_holder() */
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_int32(xdr, lock->svid);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* l_offset and l_len are uint64 on the wire: 8 bytes each */
+        p = xdr_reserve_space(xdr, 8 + 8);
+        nlm4_compute_offsets(lock, &l_offset, &l_len);
+        p = xdr_encode_hyper(p, l_offset);
+        xdr_encode_hyper(p, l_len);
+}
+/*
+ * Decode an nlm4_holder into result->lock.
+ *
+ * The lock is zeroed and its file_lock re-initialised first.  The
+ * decoded svid is also used as fl_pid, a non-zero "exclusive" word
+ * selects F_WRLCK, and a zero length (or an end that does not fit a
+ * positive signed 64-bit value) maps to fl_end == OFFSET_MAX.
+ * Returns 0 on success or -EIO on a malformed or short reply.
+ */
+static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+        struct nlm_lock *lock = &result->lock;
+        struct file_lock *fl = &lock->fl;
+        u64 l_offset, l_len;
+        u32 exclusive;
+        int error;
+        __be32 *p;
+        /* must be 64 bits wide: NLMv4 offsets and lengths are uint64 */
+        s64 end;
+        memset(lock, 0, sizeof(*lock));
+        locks_init_lock(fl);
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        exclusive = be32_to_cpup(p++);
+        lock->svid = be32_to_cpup(p);
+        fl->fl_pid = (pid_t)lock->svid;
+        error = decode_netobj(xdr, &lock->oh);
+        if (unlikely(error))
+                goto out;
+        p = xdr_inline_decode(xdr, 8 + 8);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fl->fl_flags = FL_POSIX;
+        fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+        p = xdr_decode_hyper(p, &l_offset);
+        xdr_decode_hyper(p, &l_len);
+        end = l_offset + l_len - 1;
+        fl->fl_start = (loff_t)l_offset;
+        /* a zero length or an overflowing end means "to end of file" */
+        if (l_len == 0 || end < 0)
+                fl->fl_end = OFFSET_MAX;
+        else
+                fl->fl_end = (loff_t)end;
+        error = 0;
+out:
+        return error;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      string caller_name<LM_MAXSTRLEN>;
+ */
+/*
+ * Append the NUL-terminated string @name as an XDR opaque.  Its length
+ * must not exceed NLM_MAXSTRLEN.
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+        /* NB: client-side does not set lock->len */
+        const u32 namelen = strlen(name);
+        __be32 *dst;
+        BUG_ON(namelen > NLM_MAXSTRLEN);
+        dst = xdr_reserve_space(xdr, 4 + namelen);
+        xdr_encode_opaque(dst, name, namelen);
+}
+/*
+ *      struct nlm4_lock {
+ *              string  caller_name<LM_MAXSTRLEN>;
+ *              netobj  fh;
+ *              netobj  oh;
+ *              int32   svid;
+ *              uint64  l_offset;
+ *              uint64  l_len;
+ *      };
+ */
+/*
+ * Encode an nlm4_lock: caller name, file handle, owner handle, svid,
+ * then the 64-bit offset and length computed from @lock.
+ */
+static void encode_nlm4_lock(struct xdr_stream *xdr,
+                             const struct nlm_lock *lock)
+{
+        u64 start, count;
+        __be32 *cursor;
+        encode_caller_name(xdr, lock->caller);
+        encode_fh(xdr, &lock->fh);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* one word for svid plus two 8-byte hypers */
+        cursor = xdr_reserve_space(xdr, 4 + 8 + 8);
+        *cursor++ = cpu_to_be32(lock->svid);
+        nlm4_compute_offsets(lock, &start, &count);
+        cursor = xdr_encode_hyper(cursor, start);
+        xdr_encode_hyper(cursor, count);
+}
+/*
+ * NLMv4 XDR encode functions
+ *
+ * NLMv4 argument types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      struct nlm4_testargs {
+ *              netobj cookie;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+/*
+ * Encode nlm4_testargs: cookie, exclusive flag (set for F_WRLCK), lock.
+ */
+static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_lockargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *              bool reclaim;
+ *              int state;
+ *      };
+ */
+/*
+ * Encode nlm4_lockargs: cookie, block flag, exclusive flag (set for
+ * F_WRLCK), lock, reclaim flag and state.
+ */
+static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+        encode_bool(xdr, args->reclaim);
+        encode_int32(xdr, args->state);
+}
+/*
+ *      struct nlm4_cancargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+/*
+ * Encode nlm4_cancargs: cookie, block flag, exclusive flag (set for
+ * F_WRLCK) and lock.
+ */
+static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_unlockargs {
+ *              netobj cookie;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+/*
+ * Encode nlm4_unlockargs: cookie followed by the lock.
+ */
+static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    const struct nlm_args *args)
+{
+        encode_cookie(xdr, &args->cookie);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_res {
+ *              netobj cookie;
+ *              nlm4_stat stat;
+ *      };
+ */
+/*
+ * Encode nlm4_res: cookie followed by the status word.
+ */
+static void nlm4_xdr_enc_res(struct rpc_rqst *req,
+                             struct xdr_stream *xdr,
+                             const struct nlm_res *result)
+{
+        const struct nlm_cookie *cookie = &result->cookie;
+        encode_cookie(xdr, cookie);
+        encode_nlm4_stat(xdr, result->status);
+}
+/*
+ *      union nlm4_testrply switch (nlm4_stats stat) {
+ *      case NLM4_DENIED:
+ *              struct nlm4_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm4_testres {
+ *              netobj cookie;
+ *              nlm4_testrply test_stat;
+ *      };
+ */
+/*
+ * Encode nlm4_testres: cookie and status, followed by the holder only
+ * when the status is nlm_lck_denied.
+ */
+static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_res *result)
+{
+        const __be32 status = result->status;
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm4_stat(xdr, status);
+        if (status != nlm_lck_denied)
+                return;
+        encode_nlm4_holder(xdr, result);
+}
+/*
+ * NLMv4 XDR decode functions
+ *
+ * NLMv4 argument types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      union nlm4_testrply switch (nlm4_stats stat) {
+ *      case NLM4_DENIED:
+ *              struct nlm4_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm4_testres {
+ *              netobj cookie;
+ *              nlm4_testrply test_stat;
+ *      };
+ */
+/*
+ * Decode the status of a test reply and, when it is nlm_lck_denied,
+ * the holder that follows.  Returns 0 or the decoder's error code.
+ */
+static int decode_nlm4_testrply(struct xdr_stream *xdr,
+                                struct nlm_res *result)
+{
+        int error = decode_nlm4_stat(xdr, &result->status);
+        if (unlikely(error))
+                return error;
+        if (result->status != nlm_lck_denied)
+                return 0;
+        return decode_nlm4_holder(xdr, result);
+}
+/*
+ * Decode nlm4_testres: the cookie, then the test reply.  Returns 0 or
+ * the first decoder error.
+ */
+static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
+                                struct xdr_stream *xdr,
+                                struct nlm_res *result)
+{
+        int error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                return error;
+        return decode_nlm4_testrply(xdr, result);
+}
+/*
+ *      struct nlm4_res {
+ *              netobj cookie;
+ *              nlm4_stat stat;
+ *      };
+ */
+/*
+ * Decode nlm4_res: the cookie, then the status word.  Returns 0 or the
+ * first decoder error.
+ */
+static int nlm4_xdr_dec_res(struct rpc_rqst *req,
+                            struct xdr_stream *xdr,
+                            struct nlm_res *result)
+{
+        int error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                return error;
+        return decode_nlm4_stat(xdr, &result->status);
+}
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm4_xdr_dec_norep      NULL
+/*
+ * Build one rpc_procinfo array entry at index NLMPROC_<proc>.
+ * @argtype selects the nlm4_xdr_enc_<argtype> encoder and the
+ * NLM4_<argtype>_sz argument length; @restype selects the
+ * nlm4_xdr_dec_<restype> decoder and the NLM4_<restype>_sz reply length.
+ */
+#define PROC(proc, argtype, restype)                                    \
+[NLMPROC_##proc] = {                                                    \
+        .p_proc      = NLMPROC_##proc,                                  \
+        .p_encode    = (kxdreproc_t)nlm4_xdr_enc_##argtype,             \
+        .p_decode    = (kxdrdproc_t)nlm4_xdr_dec_##restype,             \
+        .p_arglen    = NLM4_##argtype##_sz,                             \
+        .p_replen    = NLM4_##restype##_sz,                             \
+        .p_statidx   = NLMPROC_##proc,                                  \
+        .p_name      = #proc,                                           \
+        }
+/*
+ * NLMv4 client procedure table, indexed by NLMPROC_* number.  Each
+ * entry names the argument encoder and the result decoder.  Entries
+ * using "norep" have a NULL decoder and a zero reply length.
+ */
+static struct rpc_procinfo      nlm4_procedures[] = {
+        PROC(TEST,              testargs,       testres),
+        PROC(LOCK,              lockargs,       res),
+        PROC(CANCEL,            cancargs,       res),
+        PROC(UNLOCK,            unlockargs,     res),
+        PROC(GRANTED,           testargs,       res),
+        PROC(TEST_MSG,          testargs,       norep),
+        PROC(LOCK_MSG,          lockargs,       norep),
+        PROC(CANCEL_MSG,        cancargs,       norep),
+        PROC(UNLOCK_MSG,        unlockargs,     norep),
+        PROC(GRANTED_MSG,       testargs,       norep),
+        PROC(TEST_RES,          testres,        norep),
+        PROC(LOCK_RES,          res,            norep),
+        PROC(CANCEL_RES,        res,            norep),
+        PROC(UNLOCK_RES,        res,            norep),
+        PROC(GRANTED_RES,       res,            norep),
+};
+/*
+ * RPC version descriptor for NLM version 4, backed by nlm4_procedures.
+ */
+struct rpc_version      nlm_version4 = {
+        .number         = 4,
+        .nrprocs        = ARRAY_SIZE(nlm4_procedures),
+        .procs          = nlm4_procedures,
+};
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 25509eb28fd..8d4ea8351e3 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
 */
 void nlmclnt_done(struct nlm_host *host)
 {
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        lockd_down();
 }
 EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -273,7 +273,7 @@ restart:
        spin_unlock(&nlm_blocked_lock);
        /* Release host handle after use */
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        lockd_down();
        return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 332c54cf75e..adb45ec9038 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
                return;
        list_del(&lockowner->list);
        spin_unlock(&lockowner->host->h_lock);
-        nlm_release_host(lockowner->host);
+        nlmclnt_release_host(lockowner->host);
        kfree(lockowner);
 }
@@ -207,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
                printk("nlm_alloc_call: failed, waiting for memory\n");
                schedule_timeout_interruptible(5*HZ);
        }
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        return NULL;
 }
-void nlm_release_call(struct nlm_rqst *call)
+void nlmclnt_release_call(struct nlm_rqst *call)
 {
        if (!atomic_dec_and_test(&call->a_count))
                return;
-        nlm_release_host(call->a_host);
+        nlmclnt_release_host(call->a_host);
        nlmclnt_release_lockargs(call);
        kfree(call);
 }
 static void nlmclnt_rpc_release(void *data)
 {
-        nlm_release_call(data);
+        nlmclnt_release_call(data);
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
                        status = nlm_stat_to_errno(req->a_res.status);
        }
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
@@ -593,7 +593,7 @@ again:
 out_unblock:
        nlmclnt_finish_block(block);
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 out_unlock:
        /* Fatal error: ensure that we remove the lock altogether */
@@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
        /* What to do now? I'm out of my depth... */
        status = -ENOLCK;
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
@@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
                        NLMPROC_CANCEL, &nlmclnt_cancel_ops);
        if (status == 0 && req->a_res.status == nlm_lck_denied)
                status = -ENOLCK;
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 00000000000..180ac34feb9
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
+/*
+ * linux/fs/lockd/clntxdr.c
+ *
+ * XDR functions to encode/decode NLM version 3 RPC arguments and results.
+ * NLM version 3 is backwards compatible with NLM versions 1 and 2.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+#define NLMDBG_FACILITY         NLMDBG_XDR
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM_cookie_sz           (1+(NLM_MAXCOOKIELEN>>2))
+#define NLM_caller_sz           (1+(NLMCLNT_OHSIZE>>2))
+#define NLM_owner_sz            (1+(NLMCLNT_OHSIZE>>2))
+#define NLM_fhandle_sz          (1+(NFS2_FHSIZE>>2))
+#define NLM_lock_sz             (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
+#define NLM_holder_sz           (4+NLM_owner_sz)
+#define NLM_testargs_sz         (NLM_cookie_sz+1+NLM_lock_sz)
+#define NLM_lockargs_sz         (NLM_cookie_sz+4+NLM_lock_sz)
+#define NLM_cancargs_sz         (NLM_cookie_sz+2+NLM_lock_sz)
+#define NLM_unlockargs_sz       (NLM_cookie_sz+NLM_lock_sz)
+#define NLM_testres_sz          (NLM_cookie_sz+1+NLM_holder_sz)
+#define NLM_res_sz              (NLM_cookie_sz+1)
+#define NLM_norep_sz            (0)
+/*
+ * Clamp a file offset into the symmetric range
+ * [-NLM_OFFSET_MAX, NLM_OFFSET_MAX] and return it as an s32.
+ */
+static s32 loff_t_to_s32(loff_t offset)
+{
+        if (offset >= NLM_OFFSET_MAX)
+                return NLM_OFFSET_MAX;
+        if (offset <= -NLM_OFFSET_MAX)
+                return -NLM_OFFSET_MAX;
+        return offset;
+}
+/*
+ * Derive the 32-bit l_offset/l_len pair for @lock from its file_lock
+ * range.  A range that ends at OFFSET_MAX is reported with a length of
+ * zero.
+ */
+static void nlm_compute_offsets(const struct nlm_lock *lock,
+                                u32 *l_offset, u32 *l_len)
+{
+        const loff_t start = lock->fl.fl_start;
+        const loff_t last = lock->fl.fl_end;
+        BUG_ON(start > NLM_OFFSET_MAX);
+        BUG_ON(last > NLM_OFFSET_MAX && last != OFFSET_MAX);
+        *l_offset = loff_t_to_s32(start);
+        if (last != OFFSET_MAX)
+                *l_len = loff_t_to_s32(last - start + 1);
+        else
+                *l_len = 0;
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+/*
+ * @func: name of the decode routine reporting the short buffer
+ * @xdr:  stream being decoded; the words left between xdr->p and
+ *        xdr->end are included in the debug message
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+        dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+                "Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
+/*
+ * Encode/decode NLMv3 basic data types
+ *
+ * Basic NLMv3 data types are not defined in an IETF standards
+ * document.  X/Open has a description of these data types that
+ * is useful.  See Chapter 10 of "Protocols for Interworking:
+ * XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/*
+ * Append one XDR boolean word: xdr_one when @value is non-zero,
+ * xdr_zero otherwise.
+ */
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+        __be32 *word = xdr_reserve_space(xdr, 4);
+        if (value)
+                *word = xdr_one;
+        else
+                *word = xdr_zero;
+}
+/*
+ * Append @value to the stream as a single big-endian 32-bit word.
+ */
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+        __be32 *word = xdr_reserve_space(xdr, 4);
+        *word = cpu_to_be32(value);
+}
+/*
+ *      typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+/*
+ * Append @length bytes at @data as an XDR opaque.  @length must not
+ * exceed XDR_MAX_NETOBJ.
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+                          const u8 *data, const unsigned int length)
+{
+        __be32 *dst;
+        BUG_ON(length > XDR_MAX_NETOBJ);
+        /* one length word followed by the opaque bytes */
+        dst = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(dst, data, length);
+}
+/*
+ * Decode a netobj: a 4-byte length followed by that many opaque bytes.
+ *
+ * On success obj->len holds the decoded length and obj->data points at
+ * the payload returned by xdr_inline_decode() (the bytes are not
+ * copied).  Returns 0 on success, or -EIO when the length exceeds
+ * XDR_MAX_NETOBJ or the stream does not contain the whole object.
+ */
+static int decode_netobj(struct xdr_stream *xdr,
+                         struct xdr_netobj *obj)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length > XDR_MAX_NETOBJ))
+                goto out_size;
+        /*
+         * Consume the payload too: this checks that all @length bytes
+         * are really present and moves the stream past them, so the
+         * caller's next decode starts at the field that follows.
+         */
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        obj->len = length;
+        obj->data = (u8 *)p;
+        return 0;
+out_size:
+        dprintk("NFS: returned netobj was too long: %u\n", length);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj cookie;
+ */
+/*
+ * Append @cookie as a netobj.  The cookie length must not exceed
+ * NLM_MAXCOOKIELEN.
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+                          const struct nlm_cookie *cookie)
+{
+        const u8 *bytes = (u8 *)&cookie->data;
+        BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
+        encode_netobj(xdr, bytes, cookie->len);
+}
+/*
+ * Decode a cookie netobj into @cookie, copying the bytes.
+ *
+ * A zero-length cookie is replaced by four zero bytes.  Returns 0 on
+ * success, or -EIO when the cookie is longer than NLM_MAXCOOKIELEN or
+ * the stream ends early.
+ */
+static int decode_cookie(struct xdr_stream *xdr,
+                         struct nlm_cookie *cookie)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        u32 length;
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (length == 0) {
+                /* apparently HPUX can return empty cookies */
+                cookie->len = 4;
+                memset(cookie->data, 0, 4);
+                return 0;
+        }
+        if (length > NLM_MAXCOOKIELEN) {
+                dprintk("NFS: returned cookie was too long: %u\n", length);
+                return -EIO;
+        }
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        cookie->len = length;
+        memcpy(cookie->data, p, length);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj fh;
+ */
+/*
+ * Append the file handle @fh as a netobj of exactly NFS2_FHSIZE bytes.
+ * The handle size must equal NFS2_FHSIZE.
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        const u8 *handle = (u8 *)&fh->data;
+        BUG_ON(fh->size != NFS2_FHSIZE);
+        encode_netobj(xdr, handle, NFS2_FHSIZE);
+}
+/*
+ *      enum nlm_stats {
+ *              LCK_GRANTED = 0,
+ *              LCK_DENIED = 1,
+ *              LCK_DENIED_NOLOCKS = 2,
+ *              LCK_BLOCKED = 3,
+ *              LCK_DENIED_GRACE_PERIOD = 4
+ *      };
+ *
+ *
+ *      struct nlm_stat {
+ *              nlm_stats stat;
+ *      };
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+/*
+ * Append an NLM status word.  @stat is already in network byte order
+ * and is stored unchanged; its host-order value must not exceed
+ * NLM_LCK_DENIED_GRACE_PERIOD.
+ */
+static void encode_nlm_stat(struct xdr_stream *xdr,
+                            const __be32 stat)
+{
+        __be32 *word;
+        BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
+        word = xdr_reserve_space(xdr, 4);
+        *word = stat;
+}
+/*
+ * Decode an NLM status word into @stat.
+ *
+ * The value is stored in network byte order, unchanged.  Returns 0 on
+ * success, or -EIO when the stream ends early or the status is larger
+ * than NLM_LCK_DENIED_GRACE_PERIOD.
+ */
+static int decode_nlm_stat(struct xdr_stream *xdr,
+                           __be32 *stat)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        /*
+         * Range-check in host byte order: comparing two big-endian
+         * words directly gives the wrong answer on little-endian CPUs.
+         */
+        if (unlikely(be32_to_cpup(p) > NLM_LCK_DENIED_GRACE_PERIOD))
+                goto out_enum;
+        *stat = *p;
+        return 0;
+out_enum:
+        dprintk("%s: server returned invalid nlm_stats value: %u\n",
+                __func__, be32_to_cpup(p));
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      struct nlm_holder {
+ *              bool exclusive;
+ *              int uppid;
+ *              netobj oh;
+ *              unsigned l_offset;
+ *              unsigned l_len;
+ *      };
+ */
+static void encode_nlm_holder(struct xdr_stream *xdr,
+                              const struct nlm_res *result)
+{
+        const struct nlm_lock *lock = &result->lock;
+        u32 l_offset, l_len;
+        __be32 *p;
+        encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+        encode_int32(xdr, lock->svid);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        p = xdr_reserve_space(xdr, 4 + 4);
+        nlm_compute_offsets(lock, &l_offset, &l_len);
+        *p++ = cpu_to_be32(l_offset);
+        *p   = cpu_to_be32(l_len);
+}
+static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+        struct nlm_lock *lock = &result->lock;
+        struct file_lock *fl = &lock->fl;
+        u32 exclusive, l_offset, l_len;
+        int error;
+        __be32 *p;
+        s32 end;
+        memset(lock, 0, sizeof(*lock));
+        locks_init_lock(fl);
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        exclusive = be32_to_cpup(p++);
+        lock->svid = be32_to_cpup(p);
+        fl->fl_pid = (pid_t)lock->svid;
+        error = decode_netobj(xdr, &lock->oh);
+        if (unlikely(error))
+                goto out;
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fl->fl_flags = FL_POSIX;
+        fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+        l_offset = be32_to_cpup(p++);
+        l_len = be32_to_cpup(p);
+        end = l_offset + l_len - 1;
+        fl->fl_start = (loff_t)l_offset;
+        if (l_len == 0 || end < 0)
+                fl->fl_end = OFFSET_MAX;
+        else
+                fl->fl_end = (loff_t)end;
+        error = 0;
+out:
+        return error;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      string caller_name<LM_MAXSTRLEN>;
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+        /* NB: client-side does not set lock->len */
+        u32 length = strlen(name);
+        __be32 *p;
+        BUG_ON(length > NLM_MAXSTRLEN);
+        p = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(p, name, length);
+}
+/*
+ *      struct nlm_lock {
+ *              string caller_name<LM_MAXSTRLEN>;
+ *              netobj fh;
+ *              netobj oh;
+ *              int uppid;
+ *              unsigned l_offset;
+ *              unsigned l_len;
+ *      };
+ */
+static void encode_nlm_lock(struct xdr_stream *xdr,
+                            const struct nlm_lock *lock)
+{
+        u32 l_offset, l_len;
+        __be32 *p;
+        encode_caller_name(xdr, lock->caller);
+        encode_fh(xdr, &lock->fh);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4);
+        *p++ = cpu_to_be32(lock->svid);
+        nlm_compute_offsets(lock, &l_offset, &l_len);
+        *p++ = cpu_to_be32(l_offset);
+        *p   = cpu_to_be32(l_len);
+}
+/*
+ * NLMv3 XDR encode functions
+ *
+ * NLMv3 argument types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      struct nlm_testargs {
+ *              netobj cookie;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_lockargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *              bool reclaim;
+ *              int state;
+ *      };
+ */
+static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+        encode_bool(xdr, args->reclaim);
+        encode_int32(xdr, args->state);
+}
+/*
+ *      struct nlm_cancargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_unlockargs {
+ *              netobj cookie;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_res {
+ *              netobj cookie;
+ *              nlm_stat stat;
+ *      };
+ */
+static void nlm_xdr_enc_res(struct rpc_rqst *req,
+                            struct xdr_stream *xdr,
+                            const struct nlm_res *result)
+{
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm_stat(xdr, result->status);
+}
+/*
+ *      union nlm_testrply switch (nlm_stats stat) {
+ *      case LCK_DENIED:
+ *              struct nlm_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm_testres {
+ *              netobj cookie;
+ *              nlm_testrply test_stat;
+ *      };
+ */
+static void encode_nlm_testrply(struct xdr_stream *xdr,
+                                const struct nlm_res *result)
+{
+        if (result->status == nlm_lck_denied)
+                encode_nlm_holder(xdr, result);
+}
+static void nlm_xdr_enc_testres(struct rpc_rqst *req,
+                                struct xdr_stream *xdr,
+                                const struct nlm_res *result)
+{
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm_stat(xdr, result->status);
+        encode_nlm_testrply(xdr, result);
+}
+/*
+ * NLMv3 XDR decode functions
+ *
+ * NLMv3 result types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      union nlm_testrply switch (nlm_stats stat) {
+ *      case LCK_DENIED:
+ *              struct nlm_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm_testres {
+ *              netobj cookie;
+ *              nlm_testrply test_stat;
+ *      };
+ */
+static int decode_nlm_testrply(struct xdr_stream *xdr,
+                               struct nlm_res *result)
+{
+        int error;
+        error = decode_nlm_stat(xdr, &result->status);
+        if (unlikely(error))
+                goto out;
+        if (result->status == nlm_lck_denied)
+                error = decode_nlm_holder(xdr, result);
+out:
+        return error;
+}
+static int nlm_xdr_dec_testres(struct rpc_rqst *req,
+                               struct xdr_stream *xdr,
+                               struct nlm_res *result)
+{
+        int error;
+        error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                goto out;
+        error = decode_nlm_testrply(xdr, result);
+out:
+        return error;
+}
+/*
+ *      struct nlm_res {
+ *              netobj cookie;
+ *              nlm_stat stat;
+ *      };
+ */
+static int nlm_xdr_dec_res(struct rpc_rqst *req,
+                           struct xdr_stream *xdr,
+                           struct nlm_res *result)
+{
+        int error;
+        error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                goto out;
+        error = decode_nlm_stat(xdr, &result->status);
+out:
+        return error;
+}
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm_xdr_dec_norep       NULL
+#define PROC(proc, argtype, restype)    \
+[NLMPROC_##proc] = {                                                    \
+        .p_proc      = NLMPROC_##proc,                                  \
+        .p_encode    = (kxdreproc_t)nlm_xdr_enc_##argtype,              \
+        .p_decode    = (kxdrdproc_t)nlm_xdr_dec_##restype,              \
+        .p_arglen    = NLM_##argtype##_sz,                              \
+        .p_replen    = NLM_##restype##_sz,                              \
+        .p_statidx   = NLMPROC_##proc,                                  \
+        .p_name      = #proc,                                           \
+        }
+static struct rpc_procinfo      nlm_procedures[] = {
+        PROC(TEST,              testargs,       testres),
+        PROC(LOCK,              lockargs,       res),
+        PROC(CANCEL,            cancargs,       res),
+        PROC(UNLOCK,            unlockargs,     res),
+        PROC(GRANTED,           testargs,       res),
+        PROC(TEST_MSG,          testargs,       norep),
+        PROC(LOCK_MSG,          lockargs,       norep),
+        PROC(CANCEL_MSG,        cancargs,       norep),
+        PROC(UNLOCK_MSG,        unlockargs,     norep),
+        PROC(GRANTED_MSG,       testargs,       norep),
+        PROC(TEST_RES,          testres,        norep),
+        PROC(LOCK_RES,          res,            norep),
+        PROC(CANCEL_RES,        res,            norep),
+        PROC(UNLOCK_RES,        res,            norep),
+        PROC(GRANTED_RES,       res,            norep),
+};
+static struct rpc_version       nlm_version1 = {
+                .number         = 1,
+                .nrprocs        = ARRAY_SIZE(nlm_procedures),
+                .procs          = nlm_procedures,
+};
+static struct rpc_version       nlm_version3 = {
+                .number         = 3,
+                .nrprocs        = ARRAY_SIZE(nlm_procedures),
+                .procs          = nlm_procedures,
+};
+static struct rpc_version       *nlm_versions[] = {
+        [1] = &nlm_version1,
+        [3] = &nlm_version3,
+#ifdef CONFIG_LOCKD_V4
+        [4] = &nlm_version4,
+#endif
+};
+static struct rpc_stat          nlm_rpc_stats;
+struct rpc_program              nlm_program = {
+                .name           = "lockd",
+                .number         = NLM_PROGRAM,
+                .nrvers         = ARRAY_SIZE(nlm_versions),
+                .version        = nlm_versions,
+                .stats          = &nlm_rpc_stats,
+};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ed0c59fe23c..5f1bcb2f06f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
 #define NLM_HOST_EXPIRE         (300 * HZ)
 #define NLM_HOST_COLLECT        (120 * HZ)
-static struct hlist_head        nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head        nlm_server_hosts[NLM_HOST_NRHASH];
+static struct hlist_head        nlm_client_hosts[NLM_HOST_NRHASH];
+#define for_each_host(host, pos, chain, table) \
+        for ((chain) = (table); \
+             (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+                hlist_for_each_entry((host), (pos), (chain), h_hash)
+#define for_each_host_safe(host, pos, next, chain, table) \
+        for ((chain) = (table); \
+             (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+                hlist_for_each_entry_safe((host), (pos), (next), \
+                                                (chain), h_hash)
 static unsigned long            next_gc;
-static int                      nrhosts;
+static unsigned long            nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 static void                     nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
        const u32               version;        /* NLM version to search for */
        const char              *hostname;      /* remote's hostname */
        const size_t            hostname_len;   /* it's length */
-        const struct sockaddr   *src_sap;       /* our address (optional) */
-        const size_t            src_len;        /* it's length */
        const int               noresvport;     /* use non-priv port */
 };
@@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 }
 /*
- * Common host lookup routine for server & client
+ * Allocate and initialize an nlm_host.  Common to both client and server.
 */
-static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
+static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+                                       struct nsm_handle *nsm)
 {
-        struct hlist_head *chain;
+        struct nlm_host *host = NULL;
-        struct hlist_node *pos;
+        unsigned long now = jiffies;
-        struct nlm_host *host;
-        struct nsm_handle *nsm = NULL;
-        mutex_lock(&nlm_host_mutex);
-        if (time_after_eq(jiffies, next_gc))
+        if (nsm != NULL)
-                nlm_gc_hosts();
-        /* We may keep several nlm_host objects for a peer, because each
-         * nlm_host is identified by
-         * (address, protocol, version, server/client)
-         * We could probably simplify this a little by putting all those
-         * different NLM rpc_clients into one single nlm_host object.
-         * This would allow us to have one nlm_host per address.
-         */
-        chain = &nlm_hosts[nlm_hash_address(ni->sap)];
-        hlist_for_each_entry(host, pos, chain, h_hash) {
-                if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
-                        continue;
-                /* See if we have an NSM handle for this client */
-                if (!nsm)
-                        nsm = host->h_nsmhandle;
-                if (host->h_proto != ni->protocol)
-                        continue;
-                if (host->h_version != ni->version)
-                        continue;
-                if (host->h_server != ni->server)
-                        continue;
-                if (ni->server && ni->src_len != 0 &&
-                    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
-                        continue;
-                /* Move to head of hash chain. */
-                hlist_del(&host->h_hash);
-                hlist_add_head(&host->h_hash, chain);
-                nlm_get_host(host);
-                dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
-                                host->h_name, host->h_addrbuf);
-                goto out;
-        }
-        /*
-         * The host wasn't in our hash table.  If we don't
-         * have an NSM handle for it yet, create one.
-         */
-        if (nsm)
                atomic_inc(&nsm->sm_count);
        else {
                host = NULL;
                nsm = nsm_get_handle(ni->sap, ni->salen,
                                        ni->hostname, ni->hostname_len);
-                if (!nsm) {
+                if (unlikely(nsm == NULL)) {
-                        dprintk("lockd: nlm_lookup_host failed; "
+                        dprintk("lockd: %s failed; no nsm handle\n",
-                                "no nsm handle\n");
+                                __func__);
                        goto out;
                }
        }
-        host = kzalloc(sizeof(*host), GFP_KERNEL);
+        host = kmalloc(sizeof(*host), GFP_KERNEL);
-        if (!host) {
+        if (unlikely(host == NULL)) {
+                dprintk("lockd: %s failed; no memory\n", __func__);
                nsm_release(nsm);
-                dprintk("lockd: nlm_lookup_host failed; no memory\n");
                goto out;
        }
-        host->h_name       = nsm->sm_name;
-        host->h_addrbuf    = nsm->sm_addrbuf;
        memcpy(nlm_addr(host), ni->sap, ni->salen);
-        host->h_addrlen = ni->salen;
+        host->h_addrlen    = ni->salen;
        rpc_set_port(nlm_addr(host), 0);
-        memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+        host->h_srcaddrlen = 0;
-        host->h_srcaddrlen = ni->src_len;
+        host->h_rpcclnt    = NULL;
+        host->h_name       = nsm->sm_name;
        host->h_version    = ni->version;
        host->h_proto      = ni->protocol;
-        host->h_rpcclnt    = NULL;
+        host->h_reclaiming = 0;
-        mutex_init(&host->h_mutex);
+        host->h_server     = ni->server;
-        host->h_nextrebind = jiffies + NLM_HOST_REBIND;
+        host->h_noresvport = ni->noresvport;
-        host->h_expires    = jiffies + NLM_HOST_EXPIRE;
+        host->h_inuse      = 0;
-        atomic_set(&host->h_count, 1);
        init_waitqueue_head(&host->h_gracewait);
        init_rwsem(&host->h_rwsem);
-        host->h_state      = 0;                 /* pseudo NSM state */
+        host->h_state      = 0;
-        host->h_nsmstate   = 0;                 /* real NSM state */
+        host->h_nsmstate   = 0;
-        host->h_nsmhandle  = nsm;
+        host->h_pidcount   = 0;
-        host->h_server     = ni->server;
+        atomic_set(&host->h_count, 1);
-        host->h_noresvport = ni->noresvport;
+        mutex_init(&host->h_mutex);
-        hlist_add_head(&host->h_hash, chain);
+        host->h_nextrebind = now + NLM_HOST_REBIND;
+        host->h_expires    = now + NLM_HOST_EXPIRE;
        INIT_LIST_HEAD(&host->h_lockowners);
        spin_lock_init(&host->h_lock);
        INIT_LIST_HEAD(&host->h_granted);
        INIT_LIST_HEAD(&host->h_reclaim);
+        host->h_nsmhandle  = nsm;
-        nrhosts++;
+        host->h_addrbuf    = nsm->sm_addrbuf;
-        dprintk("lockd: nlm_lookup_host created host %s\n",
-                        host->h_name);
 out:
-        mutex_unlock(&nlm_host_mutex);
        return host;
 }
 /*
- * Destroy a host
+ * Destroy an nlm_host and free associated resources
+ *
+ * Caller must hold nlm_host_mutex.
 */
-static void
+static void nlm_destroy_host_locked(struct nlm_host *host)
-nlm_destroy_host(struct nlm_host *host)
 {
        struct rpc_clnt *clnt;
+        dprintk("lockd: destroy host %s\n", host->h_name);
        BUG_ON(!list_empty(&host->h_lockowners));
        BUG_ON(atomic_read(&host->h_count));
+        hlist_del_init(&host->h_hash);
        nsm_unmonitor(host);
        nsm_release(host->h_nsmhandle);
@@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
        if (clnt != NULL)
                rpc_shutdown_client(clnt);
        kfree(host);
+        nrhosts--;
 }
 /**
@@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
                .hostname_len   = strlen(hostname),
                .noresvport     = noresvport,
        };
+        struct hlist_head *chain;
+        struct hlist_node *pos;
+        struct nlm_host *host;
+        struct nsm_handle *nsm = NULL;
        dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
                        (hostname ? hostname : "<none>"), version,
                        (protocol == IPPROTO_UDP ? "udp" : "tcp"));
-        return nlm_lookup_host(&ni);
+        mutex_lock(&nlm_host_mutex);
+        chain = &nlm_client_hosts[nlm_hash_address(sap)];
+        hlist_for_each_entry(host, pos, chain, h_hash) {
+                if (!rpc_cmp_addr(nlm_addr(host), sap))
+                        continue;
+                /* Same address. Share an NSM handle if we already have one */
+                if (nsm == NULL)
+                        nsm = host->h_nsmhandle;
+                if (host->h_proto != protocol)
+                        continue;
+                if (host->h_version != version)
+                        continue;
+                nlm_get_host(host);
+                dprintk("lockd: %s found host %s (%s)\n", __func__,
+                        host->h_name, host->h_addrbuf);
+                goto out;
+        }
+        host = nlm_alloc_host(&ni, nsm);
+        if (unlikely(host == NULL))
+                goto out;
+        hlist_add_head(&host->h_hash, chain);
+        nrhosts++;
+        dprintk("lockd: %s created host %s (%s)\n", __func__,
+                host->h_name, host->h_addrbuf);
+out:
+        mutex_unlock(&nlm_host_mutex);
+        return host;
+}
+/**
+ * nlmclnt_release_host - release client nlm_host
+ * @host: nlm_host to release
+ *
+ */
+void nlmclnt_release_host(struct nlm_host *host)
+{
+        if (host == NULL)
+                return;
+        dprintk("lockd: release client host %s\n", host->h_name);
+        BUG_ON(atomic_read(&host->h_count) < 0);
+        BUG_ON(host->h_server);
+        /*
+         * Drop the reference and take nlm_host_mutex in one step.  The
+         * host stays hashed until nlm_destroy_host_locked() runs, and
+         * nlmclnt_lookup_host() takes new references under the mutex, so
+         * a separate dec-then-lock would let a lookup re-reference a host
+         * that is about to be freed.
+         */
+        if (atomic_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
+                BUG_ON(!list_empty(&host->h_lockowners));
+                BUG_ON(!list_empty(&host->h_granted));
+                BUG_ON(!list_empty(&host->h_reclaim));
+                nlm_destroy_host_locked(host);
+                mutex_unlock(&nlm_host_mutex);
+        }
 }
 /**
@@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
                                    const char *hostname,
                                    const size_t hostname_len)
 {
+        struct hlist_head *chain;
+        struct hlist_node *pos;
+        struct nlm_host *host = NULL;
+        struct nsm_handle *nsm = NULL;
        struct sockaddr_in sin = {
                .sin_family     = AF_INET,
        };
        struct sockaddr_in6 sin6 = {
                .sin6_family    = AF_INET6,
        };
+        struct sockaddr *src_sap;
+        size_t src_len = rqstp->rq_addrlen;
        struct nlm_lookup_host_info ni = {
                .server         = 1,
                .sap            = svc_addr(rqstp),
@@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
                .version        = rqstp->rq_vers,
                .hostname       = hostname,
                .hostname_len   = hostname_len,
-                .src_len        = rqstp->rq_addrlen,
        };
        dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
                        (int)hostname_len, hostname, rqstp->rq_vers,
                        (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+        mutex_lock(&nlm_host_mutex);
        switch (ni.sap->sa_family) {
        case AF_INET:
                sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-                ni.src_sap = (struct sockaddr *)&sin;
+                src_sap = (struct sockaddr *)&sin;
                break;
        case AF_INET6:
                ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-                ni.src_sap = (struct sockaddr *)&sin6;
+                src_sap = (struct sockaddr *)&sin6;
                break;
        default:
-                return NULL;
+                dprintk("lockd: %s failed; unrecognized address family\n",
+                        __func__);
+                goto out;
+        }
+        if (time_after_eq(jiffies, next_gc))
+                nlm_gc_hosts();
+        chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
+        hlist_for_each_entry(host, pos, chain, h_hash) {
+                if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
+                        continue;
+                /* Same address. Share an NSM handle if we already have one */
+                if (nsm == NULL)
+                        nsm = host->h_nsmhandle;
+                if (host->h_proto != ni.protocol)
+                        continue;
+                if (host->h_version != ni.version)
+                        continue;
+                if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
+                        continue;
+                /* Move to head of hash chain. */
+                hlist_del(&host->h_hash);
+                hlist_add_head(&host->h_hash, chain);
+                nlm_get_host(host);
+                dprintk("lockd: %s found host %s (%s)\n",
+                        __func__, host->h_name, host->h_addrbuf);
+                goto out;
        }
-        return nlm_lookup_host(&ni);
+        host = nlm_alloc_host(&ni, nsm);
+        if (unlikely(host == NULL))
+                goto out;
+        memcpy(nlm_srcaddr(host), src_sap, src_len);
+        host->h_srcaddrlen = src_len;
+        hlist_add_head(&host->h_hash, chain);
+        nrhosts++;
+        dprintk("lockd: %s created host %s (%s)\n",
+                __func__, host->h_name, host->h_addrbuf);
+out:
+        mutex_unlock(&nlm_host_mutex);
+        return host;
+}
+/**
+ * nlmsvc_release_host - release server nlm_host
+ * @host: nlm_host to release
+ *
+ * Host is destroyed later in nlm_gc_hosts().
+ */
+void nlmsvc_release_host(struct nlm_host *host)
+{
+        if (host == NULL)
+                return;
+        dprintk("lockd: release server host %s\n", host->h_name);
+        BUG_ON(atomic_read(&host->h_count) < 0);
+        BUG_ON(!host->h_server);
+        atomic_dec(&host->h_count);
 }
 /*
@@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
        return host;
 }
-/*
+static struct nlm_host *next_host_state(struct hlist_head *cache,
- * Release NLM host after use
+                                        struct nsm_handle *nsm,
- */
+                                        const struct nlm_reboot *info)
-void nlm_release_host(struct nlm_host *host)
 {
-        if (host != NULL) {
+        struct nlm_host *host = NULL;
-                dprintk("lockd: release host %s\n", host->h_name);
+        struct hlist_head *chain;
-                BUG_ON(atomic_read(&host->h_count) < 0);
+        struct hlist_node *pos;
-                if (atomic_dec_and_test(&host->h_count)) {
-                        BUG_ON(!list_empty(&host->h_lockowners));
+        mutex_lock(&nlm_host_mutex);
-                        BUG_ON(!list_empty(&host->h_granted));
+        for_each_host(host, pos, chain, cache) {
-                        BUG_ON(!list_empty(&host->h_reclaim));
+                if (host->h_nsmhandle == nsm
+                    && host->h_nsmstate != info->state) {
+                        host->h_nsmstate = info->state;
+                        host->h_state++;
+                        nlm_get_host(host);
+                        mutex_unlock(&nlm_host_mutex);
+                        return host;
                 }
         }
+        /*
+         * No match.  Do not return the cursor: after a full walk it may
+         * still reference the last host visited, which was never passed
+         * to nlm_get_host() here.
+         */
+        mutex_unlock(&nlm_host_mutex);
+        return NULL;
 }
 /**
@@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host)
 */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
-        struct hlist_head *chain;
-        struct hlist_node *pos;
        struct nsm_handle *nsm;
        struct nlm_host *host;
@@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
         * lock for this.
         * To avoid processing a host several times, we match the nsmstate.
         */
-again:  mutex_lock(&nlm_host_mutex);
+        while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+                nlmsvc_free_host_resources(host);
-                hlist_for_each_entry(host, pos, chain, h_hash) {
+                nlmsvc_release_host(host);
-                        if (host->h_nsmhandle == nsm
-                         && host->h_nsmstate != info->state) {
-                                host->h_nsmstate = info->state;
-                                host->h_state++;
-                                nlm_get_host(host);
-                                mutex_unlock(&nlm_host_mutex);
-                                if (host->h_server) {
-                                        /* We're server for this guy, just ditch
-                                         * all the locks he held. */
-                                        nlmsvc_free_host_resources(host);
-                                } else {
-                                        /* He's the server, initiate lock recovery. */
-                                        nlmclnt_recovery(host);
-                                }
-                                nlm_release_host(host);
-                                goto again;
-                        }
-                }
        }
-        mutex_unlock(&nlm_host_mutex);
+        while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
+                nlmclnt_recovery(host);
+                nlmclnt_release_host(host);
+        }
        nsm_release(nsm);
 }
@@ -497,13 +589,11 @@ nlm_shutdown_hosts(void)
        /* First, make all hosts eligible for gc */
        dprintk("lockd: nuking all hosts...\n");
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host(host, pos, chain, nlm_server_hosts) {
-                hlist_for_each_entry(host, pos, chain, h_hash) {
+                host->h_expires = jiffies - 1;
-                        host->h_expires = jiffies - 1;
+                if (host->h_rpcclnt) {
-                        if (host->h_rpcclnt) {
+                        rpc_shutdown_client(host->h_rpcclnt);
-                                rpc_shutdown_client(host->h_rpcclnt);
+                        host->h_rpcclnt = NULL;
-                                host->h_rpcclnt = NULL;
-                        }
                }
        }
@@ -512,15 +602,13 @@ nlm_shutdown_hosts(void)
        mutex_unlock(&nlm_host_mutex);
        /* complain if any hosts are left */
-        if (nrhosts) {
+        if (nrhosts != 0) {
                printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
-                dprintk("lockd: %d hosts left:\n", nrhosts);
+                dprintk("lockd: %lu hosts left:\n", nrhosts);
-                for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+                for_each_host(host, pos, chain, nlm_server_hosts) {
-                        hlist_for_each_entry(host, pos, chain, h_hash) {
+                        dprintk("       %s (cnt %d use %d exp %ld)\n",
-                                dprintk("       %s (cnt %d use %d exp %ld)\n",
+                                host->h_name, atomic_read(&host->h_count),
-                                        host->h_name, atomic_read(&host->h_count),
+                                host->h_inuse, host->h_expires);
-                                        host->h_inuse, host->h_expires);
-                        }
                }
        }
 }
@@ -538,29 +626,22 @@ nlm_gc_hosts(void)
        struct nlm_host *host;
        dprintk("lockd: host garbage collection\n");
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host(host, pos, chain, nlm_server_hosts)
-                hlist_for_each_entry(host, pos, chain, h_hash)
+                host->h_inuse = 0;
-                        host->h_inuse = 0;
-        }
        /* Mark all hosts that hold locks, blocks or shares */
        nlmsvc_mark_resources();
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
-                hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
+                if (atomic_read(&host->h_count) || host->h_inuse
-                        if (atomic_read(&host->h_count) || host->h_inuse
+                 || time_before(jiffies, host->h_expires)) {
-                         || time_before(jiffies, host->h_expires)) {
+                        dprintk("nlm_gc_hosts skipping %s "
-                                dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
+                                "(cnt %d use %d exp %ld)\n",
-                                        host->h_name, atomic_read(&host->h_count),
+                                host->h_name, atomic_read(&host->h_count),
-                                        host->h_inuse, host->h_expires);
+                                host->h_inuse, host->h_expires);
-                                continue;
+                        continue;
-                        }
-                        dprintk("lockd: delete host %s\n", host->h_name);
-                        hlist_del_init(&host->h_hash);
-                        nlm_destroy_host(host);
-                        nrhosts--;
                }
+                nlm_destroy_host_locked(host);
        }
        next_gc = jiffies + NLM_HOST_COLLECT;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c91894964..23d7451b293 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
 * Status Monitor wire protocol.
 */
-static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
+static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
        const u32 len = strlen(string);
        __be32 *p;
-        if (unlikely(len > SM_MAXSTRLEN))
+        BUG_ON(len > SM_MAXSTRLEN);
-                return -EIO;
+        p = xdr_reserve_space(xdr, 4 + len);
-        p = xdr_reserve_space(xdr, sizeof(u32) + len);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque(p, string, len);
-        return 0;
 }
 /*
 * "mon_name" specifies the host to be monitored.
 */
-static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        return encode_nsm_string(xdr, argp->mon_name);
+        encode_nsm_string(xdr, argp->mon_name);
 }
 /*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
 * has changed.
 */
-static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        int status;
        __be32 *p;
-        status = encode_nsm_string(xdr, utsname()->nodename);
+        encode_nsm_string(xdr, utsname()->nodename);
-        if (unlikely(status != 0))
+        p = xdr_reserve_space(xdr, 4 + 4 + 4);
-                return status;
+        *p++ = cpu_to_be32(argp->prog);
-        p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+        *p++ = cpu_to_be32(argp->vers);
-        if (unlikely(p == NULL))
+        *p = cpu_to_be32(argp->proc);
-                return -EIO;
-        *p++ = htonl(argp->prog);
-        *p++ = htonl(argp->vers);
-        *p++ = htonl(argp->proc);
-        return 0;
 }
 /*
 * The "mon_id" argument specifies the non-private arguments
 * of an NSMPROC_MON or NSMPROC_UNMON call.
 */
-static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        int status;
+        encode_mon_name(xdr, argp);
+        encode_my_id(xdr, argp);
-        status = encode_mon_name(xdr, argp);
-        if (unlikely(status != 0))
-                return status;
-        return encode_my_id(xdr, argp);
 }
 /*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 * by the NSMPROC_MON call. This information will be supplied in the
 * NLMPROC_SM_NOTIFY call.
 */
-static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
        __be32 *p;
        p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
-        return 0;
 }
-static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
+static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
-                       const struct nsm_args *argp)
+                            const struct nsm_args *argp)
 {
-        struct xdr_stream xdr;
+        encode_mon_id(xdr, argp);
-        int status;
+        encode_priv(xdr, argp);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        status = encode_mon_id(&xdr, argp);
-        if (unlikely(status))
-                return status;
-        return encode_priv(&xdr, argp);
 }
-static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
+static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
-                         const struct nsm_args *argp)
+                              const struct nsm_args *argp)
 {
-        struct xdr_stream xdr;
+        encode_mon_id(xdr, argp);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        return encode_mon_id(&xdr, argp);
 }
-static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
+static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
-                            struct nsm_res *resp)
+                                struct xdr_stream *xdr,
+                                struct nsm_res *resp)
 {
-        struct xdr_stream xdr;
+        __be32 *p;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(xdr, 4 + 4);
-        p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
        if (unlikely(p == NULL))
                return -EIO;
-        resp->status = ntohl(*p++);
+        resp->status = be32_to_cpup(p++);
-        resp->state = ntohl(*p);
+        resp->state = be32_to_cpup(p);
-        dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
+        dprintk("lockd: %s status %d state %d\n",
-                        resp->status, resp->state);
+                __func__, resp->status, resp->state);
        return 0;
 }
-static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
+static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
-                        struct nsm_res *resp)
+                            struct xdr_stream *xdr,
+                            struct nsm_res *resp)
 {
-        struct xdr_stream xdr;
+        __be32 *p;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(xdr, 4);
-        p = xdr_inline_decode(&xdr, sizeof(u32));
        if (unlikely(p == NULL))
                return -EIO;
-        resp->state = ntohl(*p);
+        resp->state = be32_to_cpup(p);
-        dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
+        dprintk("lockd: %s state %d\n", __func__, resp->state);
        return 0;
 }
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
 static struct rpc_procinfo      nsm_procedures[] = {
 [NSMPROC_MON] = {
                .p_proc         = NSMPROC_MON,
-                .p_encode       = (kxdrproc_t)xdr_enc_mon,
+                .p_encode       = (kxdreproc_t)nsm_xdr_enc_mon,
-                .p_decode       = (kxdrproc_t)xdr_dec_stat_res,
+                .p_decode       = (kxdrdproc_t)nsm_xdr_dec_stat_res,
                .p_arglen       = SM_mon_sz,
                .p_replen       = SM_monres_sz,
                .p_statidx      = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo	nsm_procedures[] = {
        },
 [NSMPROC_UNMON] = {
                .p_proc         = NSMPROC_UNMON,
-                .p_encode       = (kxdrproc_t)xdr_enc_unmon,
+                .p_encode       = (kxdreproc_t)nsm_xdr_enc_unmon,
-                .p_decode       = (kxdrproc_t)xdr_dec_stat,
+                .p_decode       = (kxdrdproc_t)nsm_xdr_dec_stat,
                .p_arglen       = SM_mon_id_sz,
                .p_replen       = SM_unmonres_sz,
                .p_statidx      = NSMPROC_UNMON,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 38d26119245..9a41fdc1951 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 no_locks:
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        if (error)
                return error;   
        return nlm_lck_denied_nolocks;
@@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
        dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_unlock(file, &argp->lock);
        dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 static void nlm4svc_callback_release(void *data)
 {
-        nlm_release_call(data);
+        nlmsvc_release_call(data);
 }
 static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
        stat = func(rqstp, argp, &call->a_res);
        if (stat != 0) {
-                nlm_release_call(call);
+                nlmsvc_release_call(call);
                return stat;
        }
@@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_share_file(host, file, argp);
        dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_unshare_file(host, file, argp);
        dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
                return rpc_success;
        nlmsvc_free_host_resources(host);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        return rpc_success;
 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ef5659b211e..6e31695d046 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,6 +46,7 @@ static void	nlmsvc_remove_block(struct nlm_block *block);
 static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
 static void nlmsvc_freegrantargs(struct nlm_rqst *call);
 static const struct rpc_call_ops nlmsvc_grant_ops;
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
 /*
 * The list of blocked locks to retry
@@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 failed_free:
        kfree(block);
 failed:
-        nlm_release_call(call);
+        nlmsvc_release_call(call);
        return NULL;
 }
@@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
        mutex_unlock(&file->f_mutex);
        nlmsvc_freegrantargs(block->b_call);
-        nlm_release_call(block->b_call);
+        nlmsvc_release_call(block->b_call);
        nlm_release_file(block->b_file);
        kfree(block->b_fl);
        kfree(block);
@@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void)
        return timeout;
 }
+#ifdef RPC_DEBUG
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
+{
+        /*
+         * We can get away with a static buffer because we're only
+         * called with BKL held.
+         */
+        static char buf[2*NLM_MAXCOOKIELEN+1];
+        unsigned int i, len = sizeof(buf);
+        char *p = buf;
+        len--;  /* allow for trailing \0 */
+        if (len < 3)
+                return "???";
+        for (i = 0 ; i < cookie->len ; i++) {
+                if (len < 2) {
+                        strcpy(p-3, "...");
+                        break;
+                }
+                sprintf(p, "%02x", cookie->data[i]);
+                p += 2;
+                len -= 2;
+        }
+        *p = '\0';
+        return buf;
+}
+#endif
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0caea5310ac..d27aab11f32 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 no_locks:
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        if (error)
                return error;
        return nlm_lck_denied_nolocks;
@@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
                dprintk("lockd: TEST          status %d vers %d\n",
                        ntohl(resp->status), rqstp->rq_vers);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
        dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
        dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
                        -task->tk_status);
 }
+void nlmsvc_release_call(struct nlm_rqst *call)
+{
+        if (!atomic_dec_and_test(&call->a_count))
+                return;
+        nlmsvc_release_host(call->a_host);
+        kfree(call);
+}
 static void nlmsvc_callback_release(void *data)
 {
-        nlm_release_call(data);
+        nlmsvc_release_call(data);
 }
 static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
        stat = func(rqstp, argp, &call->a_res);
        if (stat != 0) {
-                nlm_release_call(call);
+                nlmsvc_release_call(call);
                return stat;
        }
@@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_share_file(host, file, argp));
        dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
        dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
                return rpc_success;
        nlmsvc_free_host_resources(host);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        return rpc_success;
 }
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cb..964666c68a8 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-        struct file_lock        *fl = &lock->fl;
-        __s32                   start, len;
-        if (!(p = xdr_encode_string(p, lock->caller))
-         || !(p = nlm_encode_fh(p, &lock->fh))
-         || !(p = nlm_encode_oh(p, &lock->oh)))
-                return NULL;
-        if (fl->fl_start > NLM_OFFSET_MAX
-         || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-                return NULL;
-        start = loff_t_to_s32(fl->fl_start);
-        if (fl->fl_end == OFFSET_MAX)
-                len = 0;
-        else
-                len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
-        *p++ = htonl(lock->svid);
-        *p++ = htonl(start);
-        *p++ = htonl(len);
-        return p;
-}
-/*
 * Encode result of a TEST/TEST_MSG call
 */
 static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
        return xdr_ressize_check(rqstp, p);
 }
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
-{
-        return 0;
-}
-#endif
-static int
-nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        if (resp->status == nlm_lck_denied) {
-                struct file_lock        *fl = &resp->lock.fl;
-                u32                     excl;
-                s32                     start, len, end;
-                memset(&resp->lock, 0, sizeof(resp->lock));
-                locks_init_lock(fl);
-                excl = ntohl(*p++);
-                resp->lock.svid = ntohl(*p++);
-                fl->fl_pid = (pid_t)resp->lock.svid;
-                if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
-                        return -EIO;
-                fl->fl_flags = FL_POSIX;
-                fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-                start = ntohl(*p++);
-                len = ntohl(*p++);
-                end = start + len - 1;
-                fl->fl_start = s32_to_loff_t(start);
-                if (len == 0 || end < 0)
-                        fl->fl_end = OFFSET_MAX;
-                else
-                        fl->fl_end = s32_to_loff_t(end);
-        }
-        return 0;
-}
-static int
-nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        *p++ = argp->reclaim? xdr_one : xdr_zero;
-        *p++ = htonl(argp->state);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_encode_cookie(p, &resp->cookie)))
-                return -EIO;
-        *p++ = resp->status;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_encode_testres(p, resp)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        return 0;
-}
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-/*
- * Buffer requirements for NLM
- */
-#define NLM_void_sz             0
-#define NLM_cookie_sz           1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz           1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_owner_sz            1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_fhandle_sz          1+XDR_QUADLEN(NFS2_FHSIZE)
-#define NLM_lock_sz             3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
-#define NLM_holder_sz           4+NLM_owner_sz
-#define NLM_testargs_sz         NLM_cookie_sz+1+NLM_lock_sz
-#define NLM_lockargs_sz         NLM_cookie_sz+4+NLM_lock_sz
-#define NLM_cancargs_sz         NLM_cookie_sz+2+NLM_lock_sz
-#define NLM_unlockargs_sz       NLM_cookie_sz+NLM_lock_sz
-#define NLM_testres_sz          NLM_cookie_sz+1+NLM_holder_sz
-#define NLM_res_sz              NLM_cookie_sz+1
-#define NLM_norep_sz            0
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlmclt_decode_norep     NULL
-#define PROC(proc, argtype, restype)    \
-[NLMPROC_##proc] = {                                                    \
-        .p_proc      = NLMPROC_##proc,                                  \
-        .p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,            \
-        .p_decode    = (kxdrproc_t) nlmclt_decode_##restype,            \
-        .p_arglen    = NLM_##argtype##_sz,                              \
-        .p_replen    = NLM_##restype##_sz,                              \
-        .p_statidx   = NLMPROC_##proc,                                  \
-        .p_name      = #proc,                                           \
-        }
-static struct rpc_procinfo      nlm_procedures[] = {
-    PROC(TEST,          testargs,       testres),
-    PROC(LOCK,          lockargs,       res),
-    PROC(CANCEL,        cancargs,       res),
-    PROC(UNLOCK,        unlockargs,     res),
-    PROC(GRANTED,       testargs,       res),
-    PROC(TEST_MSG,      testargs,       norep),
-    PROC(LOCK_MSG,      lockargs,       norep),
-    PROC(CANCEL_MSG,    cancargs,       norep),
-    PROC(UNLOCK_MSG,    unlockargs,     norep),
-    PROC(GRANTED_MSG,   testargs,       norep),
-    PROC(TEST_RES,      testres,        norep),
-    PROC(LOCK_RES,      res,            norep),
-    PROC(CANCEL_RES,    res,            norep),
-    PROC(UNLOCK_RES,    res,            norep),
-    PROC(GRANTED_RES,   res,            norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-    PROC(SHARE,         shareargs,      shareres),
-    PROC(UNSHARE,       shareargs,      shareres),
-    PROC(NM_LOCK,       lockargs,       res),
-    PROC(FREE_ALL,      notify,         void),
-#endif
-};
-static struct rpc_version       nlm_version1 = {
-                .number         = 1,
-                .nrprocs        = 16,
-                .procs          = nlm_procedures,
-};
-static struct rpc_version       nlm_version3 = {
-                .number         = 3,
-                .nrprocs        = 24,
-                .procs          = nlm_procedures,
-};
-static struct rpc_version *     nlm_versions[] = {
-        [1] = &nlm_version1,
-        [3] = &nlm_version3,
-#ifdef  CONFIG_LOCKD_V4
-        [4] = &nlm_version4,
-#endif
-};
-static struct rpc_stat          nlm_stats;
-struct rpc_program              nlm_program = {
-                .name           = "lockd",
-                .number         = NLM_PROGRAM,
-                .nrvers         = ARRAY_SIZE(nlm_versions),
-                .version        = nlm_versions,
-                .stats          = &nlm_stats,
-};
-#ifdef RPC_DEBUG
-const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
-{
-        /*
-         * We can get away with a static buffer because we're only
-         * called with BKL held.
-         */
-        static char buf[2*NLM_MAXCOOKIELEN+1];
-        unsigned int i, len = sizeof(buf);
-        char *p = buf;
-        len--;  /* allow for trailing \0 */
-        if (len < 3)
-                return "???";
-        for (i = 0 ; i < cookie->len ; i++) {
-                if (len < 2) {
-                        strcpy(p-3, "...");
-                        break;
-                }
-                sprintf(p, "%02x", cookie->data[i]);
-                p += 2;
-                len -= 2;
-        }
-        *p = '\0';
-        return buf;
-}
-#endif
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145..dfa4789cd46 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
        return p + XDR_QUADLEN(f->size);
 }
-static __be32 *
-nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
-{
-        *p++ = htonl(f->size);
-        if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
-        memcpy(p, f->data, f->size);
-        return p + XDR_QUADLEN(f->size);
-}
 /*
 * Encode and decode owner handle
 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
 }
 static __be32 *
-nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
-{
-        return xdr_encode_netobj(p, oh);
-}
-static __be32 *
 nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
        struct file_lock        *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-        struct file_lock        *fl = &lock->fl;
-        __s64                   start, len;
-        if (!(p = xdr_encode_string(p, lock->caller))
-         || !(p = nlm4_encode_fh(p, &lock->fh))
-         || !(p = nlm4_encode_oh(p, &lock->oh)))
-                return NULL;
-        if (fl->fl_start > NLM4_OFFSET_MAX
-         || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-                return NULL;
-        *p++ = htonl(lock->svid);
-        start = loff_t_to_s64(fl->fl_start);
-        if (fl->fl_end == OFFSET_MAX)
-                len = 0;
-        else
-                len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
-        p = xdr_encode_hyper(p, start);
-        p = xdr_encode_hyper(p, len);
-        return p;
-}
-/*
 * Encode result of a TEST/TEST_MSG call
 */
 static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
        return xdr_ressize_check(rqstp, p);
 }
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
-{
-        return 0;
-}
-#endif
-static int
-nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        if (resp->status == nlm_lck_denied) {
-                struct file_lock        *fl = &resp->lock.fl;
-                u32                     excl;
-                __u64                   start, len;
-                __s64                   end;
-                memset(&resp->lock, 0, sizeof(resp->lock));
-                locks_init_lock(fl);
-                excl = ntohl(*p++);
-                resp->lock.svid = ntohl(*p++);
-                fl->fl_pid = (pid_t)resp->lock.svid;
-                if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
-                        return -EIO;
-                fl->fl_flags = FL_POSIX;
-                fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-                p = xdr_decode_hyper(p, &start);
-                p = xdr_decode_hyper(p, &len);
-                end = start + len - 1;
-                fl->fl_start = s64_to_loff_t(start);
-                if (len == 0 || end < 0)
-                        fl->fl_end = OFFSET_MAX;
-                else
-                        fl->fl_end = s64_to_loff_t(end);
-        }
-        return 0;
-}
-static int
-nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        *p++ = argp->reclaim? xdr_one : xdr_zero;
-        *p++ = htonl(argp->state);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
-                return -EIO;
-        *p++ = resp->status;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_encode_testres(p, resp)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        return 0;
-}
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
-#  error "NLM host name cannot be larger than NLM's maximum string length!"
-#endif
-/*
- * Buffer requirements for NLM
- */
-#define NLM4_void_sz            0
-#define NLM4_cookie_sz          1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM4_caller_sz          1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_owner_sz           1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_fhandle_sz         1+XDR_QUADLEN(NFS3_FHSIZE)
-#define NLM4_lock_sz            5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
-#define NLM4_holder_sz          6+NLM4_owner_sz
-#define NLM4_testargs_sz        NLM4_cookie_sz+1+NLM4_lock_sz
-#define NLM4_lockargs_sz        NLM4_cookie_sz+4+NLM4_lock_sz
-#define NLM4_cancargs_sz        NLM4_cookie_sz+2+NLM4_lock_sz
-#define NLM4_unlockargs_sz      NLM4_cookie_sz+NLM4_lock_sz
-#define NLM4_testres_sz         NLM4_cookie_sz+1+NLM4_holder_sz
-#define NLM4_res_sz             NLM4_cookie_sz+1
-#define NLM4_norep_sz           0
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlm4clt_decode_norep    NULL
-#define PROC(proc, argtype, restype)                                    \
-[NLMPROC_##proc] = {                                                    \
-        .p_proc      = NLMPROC_##proc,                                  \
-        .p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,           \
-        .p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,           \
-        .p_arglen    = NLM4_##argtype##_sz,                             \
-        .p_replen    = NLM4_##restype##_sz,                             \
-        .p_statidx   = NLMPROC_##proc,                                  \
-        .p_name      = #proc,                                           \
-        }
-static struct rpc_procinfo      nlm4_procedures[] = {
-    PROC(TEST,          testargs,       testres),
-    PROC(LOCK,          lockargs,       res),
-    PROC(CANCEL,        cancargs,       res),
-    PROC(UNLOCK,        unlockargs,     res),
-    PROC(GRANTED,       testargs,       res),
-    PROC(TEST_MSG,      testargs,       norep),
-    PROC(LOCK_MSG,      lockargs,       norep),
-    PROC(CANCEL_MSG,    cancargs,       norep),
-    PROC(UNLOCK_MSG,    unlockargs,     norep),
-    PROC(GRANTED_MSG,   testargs,       norep),
-    PROC(TEST_RES,      testres,        norep),
-    PROC(LOCK_RES,      res,            norep),
-    PROC(CANCEL_RES,    res,            norep),
-    PROC(UNLOCK_RES,    res,            norep),
-    PROC(GRANTED_RES,   res,            norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-    PROC(SHARE,         shareargs,      shareres),
-    PROC(UNSHARE,       shareargs,      shareres),
-    PROC(NM_LOCK,       lockargs,       res),
-    PROC(FREE_ALL,      notify,         void),
-#endif
-};
-struct rpc_version      nlm_version4 = {
-        .number         = 4,
-        .nrprocs        = 24,
-        .procs          = nlm4_procedures,
-};
diff --git a/fs/locks.c b/fs/locks.c
index 8729347bcd1..08415b2a6d3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1389,7 +1389,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
                if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                        goto out;
                if ((arg == F_WRLCK)
-                    && ((atomic_read(&dentry->d_count) > 1)
+                    && ((dentry->d_count > 1)
                        || (atomic_read(&inode->i_count) > 1)))
                        goto out;
        }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 409dfd65e9a..f9ddf0c388c 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
        return __logfs_create(dir, dentry, inode, target, destlen);
 }
-static int logfs_permission(struct inode *inode, int mask)
+static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        return generic_permission(inode, mask, NULL);
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        return generic_permission(inode, mask, flags, NULL);
 }
 static int logfs_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098..03b8c240aed 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
        return __logfs_iget(sb, ino);
 }
+static void logfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
 static void __logfs_destroy_inode(struct inode *inode)
 {
        struct logfs_inode *li = logfs_inode(inode);
        BUG_ON(li->li_block);
        list_del(&li->li_freeing_list);
-        kmem_cache_free(logfs_inode_cache, li);
+        call_rcu(&inode->i_rcu, logfs_i_callback);
 }
 static void logfs_destroy_inode(struct inode *inode)
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e13..9da29706f91 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
                super->s_journal_seg[i] = segno;
                super->s_journal_ec[i] = ec;
                logfs_set_segment_reserved(sb, segno);
-                err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+                err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
                BUG_ON(err); /* mempool should prevent this */
                err = logfs_erase_segment(sb, segno, 1);
                BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e18..ee99a9f5dfd 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
        /* FIXME: transaction is part of logfs_block now.  Is that enough? */
        err = logfs_write_buf(master_inode, page, 0);
+        if (err)
+                move_page_to_inode(inode, page);
        logfs_put_write_page(page);
        return err;
 }
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237..a25444ab2ba 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 #endif
-struct mb_cache {
-        struct list_head                c_cache_list;
-        const char                      *c_name;
-        atomic_t                        c_entry_count;
-        int                             c_max_entries;
-        int                             c_bucket_bits;
-        struct kmem_cache               *c_entry_cache;
-        struct list_head                *c_block_hash;
-        struct list_head                *c_index_hash;
-};
 /*
 * Global data: list of all mbcache's, lru list, and a spinlock for
 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fb2020858a3..ae0b83f476a 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void minix_destroy_inode(struct inode *inode)
+static void minix_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
+static void minix_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, minix_i_callback);
+}
 static void init_once(void *foo)
 {
        struct minix_inode_info *ei = (struct minix_inode_info *) foo;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index c0d35a3acce..1b9e07728a9 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,7 +23,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
        struct inode * inode = NULL;
        ino_t ino;
-        dentry->d_op = dir->i_sb->s_root->d_op;
+        d_set_d_op(dentry, dir->i_sb->s_root->d_op);
        if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
                return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/namei.c b/fs/namei.c
index 4ff7ca53053..24ece10470b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
 /*
 * This does basic POSIX ACL permission checking
 */
-static int acl_permission_check(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
-                int (*check_acl)(struct inode *inode, int mask))
+                int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
        umode_t                 mode = inode->i_mode;
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
                mode >>= 6;
        else {
                if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
-                        int error = check_acl(inode, mask);
+                        int error = check_acl(inode, mask, flags);
                        if (error != -EAGAIN)
                                return error;
                }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
 }
 /**
- * generic_permission  -  check for access rights on a Posix-like filesystem
+ * generic_permission -  check for access rights on a Posix-like filesystem
 * @inode:      inode to check access rights for
 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 * @check_acl:  optional callback to check for Posix ACLs
+ * @flags:      IPERM_FLAG_ flags.
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
- * are used for other things..
+ * are used for other things.
+ *
+ * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
+ * request cannot be satisfied (eg. requires blocking or too much complexity).
+ * It would then be called again in ref-walk mode.
 */
-int generic_permission(struct inode *inode, int mask,
+int generic_permission(struct inode *inode, int mask, unsigned int flags,
-                int (*check_acl)(struct inode *inode, int mask))
+        int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
        int ret;
        /*
         * Do the basic POSIX ACL permission checks.
         */
-        ret = acl_permission_check(inode, mask, check_acl);
+        ret = acl_permission_check(inode, mask, flags, check_acl);
        if (ret != -EACCES)
                return ret;
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
        }
        if (inode->i_op->permission)
-                retval = inode->i_op->permission(inode, mask);
+                retval = inode->i_op->permission(inode, mask, 0);
        else
-                retval = generic_permission(inode, mask, inode->i_op->check_acl);
+                retval = generic_permission(inode, mask, 0,
+                                inode->i_op->check_acl);
        if (retval)
                return retval;
@@ -362,6 +368,18 @@ void path_get(struct path *path)
 EXPORT_SYMBOL(path_get);
 /**
+ * path_get_long - get a long reference to a path
+ * @path: path to get the reference to
+ *
+ * Given a path increment the reference count to the dentry and the vfsmount.
+ */
+void path_get_long(struct path *path)
+{
+        mntget_long(path->mnt);
+        dget(path->dentry);
+}
+/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
@@ -375,6 +393,185 @@ void path_put(struct path *path)
 EXPORT_SYMBOL(path_put);
 /**
+ * path_put_long - put a long reference to a path
+ * @path: path to put the reference to
+ *
+ * Given a path decrement the reference count to the dentry and the vfsmount.
+ */
+void path_put_long(struct path *path)
+{
+        dput(path->dentry);
+        mntput_long(path->mnt);
+}
+/**
+ * nameidata_drop_rcu - drop this nameidata out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * Path walking has 2 modes, rcu-walk and ref-walk (see
+ * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
+ * to drop out of rcu-walk mode and take normal reference counts on dentries
+ * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take
+ * refcounts at the last known good point before rcu-walk got stuck, so
+ * ref-walk may continue from there. If this is not successful (eg. a seqcount
+ * has changed), then failure is returned and path walk restarts from the
+ * beginning in ref-walk mode.
+ *
+ * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
+ * ref-walk. Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu(struct nameidata *nd)
+{
+        struct fs_struct *fs = current->fs;
+        struct dentry *dentry = nd->path.dentry;
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        if (nd->root.mnt) {
+                spin_lock(&fs->lock);
+                if (nd->root.mnt != fs->root.mnt ||
+                                nd->root.dentry != fs->root.dentry)
+                        goto err_root;
+        }
+        spin_lock(&dentry->d_lock);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err;
+        BUG_ON(nd->inode != dentry->d_inode);
+        spin_unlock(&dentry->d_lock);
+        if (nd->root.mnt) {
+                path_get(&nd->root);
+                spin_unlock(&fs->lock);
+        }
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        nd->flags &= ~LOOKUP_RCU;
+        return 0;
+err:
+        spin_unlock(&dentry->d_lock);
+err_root:
+        if (nd->root.mnt)
+                spin_unlock(&fs->lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
+{
+        if (nd->flags & LOOKUP_RCU)
+                return nameidata_drop_rcu(nd);
+        return 0;
+}
+/**
+ * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * @dentry: dentry to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
+ * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
+ * @nd. Must be called from rcu-walk context.
+ */
+static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+{
+        struct fs_struct *fs = current->fs;
+        struct dentry *parent = nd->path.dentry;
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        if (nd->root.mnt) {
+                spin_lock(&fs->lock);
+                if (nd->root.mnt != fs->root.mnt ||
+                                nd->root.dentry != fs->root.dentry)
+                        goto err_root;
+        }
+        spin_lock(&parent->d_lock);
+        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err;
+        /*
+         * If the sequence check on the child dentry passed, then the child has
+         * not been removed from its parent. This means the parent dentry must
+         * be valid and able to take a reference at this point.
+         */
+        BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+        BUG_ON(!parent->d_count);
+        parent->d_count++;
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
+        if (nd->root.mnt) {
+                path_get(&nd->root);
+                spin_unlock(&fs->lock);
+        }
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        nd->flags &= ~LOOKUP_RCU;
+        return 0;
+err:
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
+err_root:
+        if (nd->root.mnt)
+                spin_unlock(&fs->lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
+{
+        if (nd->flags & LOOKUP_RCU)
+                return nameidata_dentry_drop_rcu(nd, dentry);
+        return 0;
+}
+/**
+ * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
+ * nd->path should be the final element of the lookup, so nd->root is discarded.
+ * Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu_last(struct nameidata *nd)
+{
+        struct dentry *dentry = nd->path.dentry;
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        nd->flags &= ~LOOKUP_RCU;
+        nd->root.mnt = NULL;
+        spin_lock(&dentry->d_lock);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err_unlock;
+        BUG_ON(nd->inode != dentry->d_inode);
+        spin_unlock(&dentry->d_lock);
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        return 0;
+err_unlock:
+        spin_unlock(&dentry->d_lock);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
+{
+        if (likely(nd->flags & LOOKUP_RCU))
+                return nameidata_drop_rcu_last(nd);
+        return 0;
+}
+/**
 * release_open_intent - free up open intent resources
 * @nd: pointer to nameidata
 */
@@ -386,10 +583,26 @@ void release_open_intent(struct nameidata *nd)
                fput(nd->intent.open.file);
 }
+static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        int status;
+        status = dentry->d_op->d_revalidate(dentry, nd);
+        if (status == -ECHILD) {
+                if (nameidata_dentry_drop_rcu(nd, dentry))
+                        return status;
+                status = dentry->d_op->d_revalidate(dentry, nd);
+        }
+        return status;
+}
 static inline struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        int status = dentry->d_op->d_revalidate(dentry, nd);
+        int status;
+        status = d_revalidate(dentry, nd);
        if (unlikely(status <= 0)) {
                /*
                 * The dentry failed validation.
@@ -397,19 +610,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * the dentry otherwise d_revalidate is asking us
                 * to return a fail status.
                 */
-                if (!status) {
+                if (status < 0) {
+                        /* If we're in rcu-walk, we don't have a ref */
+                        if (!(nd->flags & LOOKUP_RCU))
+                                dput(dentry);
+                        dentry = ERR_PTR(status);
+                } else {
+                        /* Don't d_invalidate in rcu-walk mode */
+                        if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
+                                return ERR_PTR(-ECHILD);
                        if (!d_invalidate(dentry)) {
                                dput(dentry);
                                dentry = NULL;
                        }
-                } else {
-                        dput(dentry);
-                        dentry = ERR_PTR(status);
                }
        }
        return dentry;
 }
+static inline int need_reval_dot(struct dentry *dentry)
+{
+        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+                return 0;
+        if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+                return 0;
+        return 1;
+}
 /*
 * force_reval_path - force revalidation of a dentry
 *
@@ -433,13 +663,12 @@ force_reval_path(struct path *path, struct nameidata *nd)
        /*
         * only check on filesystems where it's possible for the dentry to
-         * become stale. It's assumed that if this flag is set then the
+         * become stale.
-         * d_revalidate op will also be defined.
         */
-        if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
+        if (!need_reval_dot(dentry))
                return 0;
-        status = dentry->d_op->d_revalidate(dentry, nd);
+        status = d_revalidate(dentry, nd);
        if (status > 0)
                return 0;
@@ -459,26 +688,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
 * short-cut DAC fails, then call ->permission() to do more
 * complete permission check.
 */
-static int exec_permission(struct inode *inode)
+static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
        int ret;
        if (inode->i_op->permission) {
-                ret = inode->i_op->permission(inode, MAY_EXEC);
+                ret = inode->i_op->permission(inode, MAY_EXEC, flags);
-                if (!ret)
+        } else {
-                        goto ok;
+                ret = acl_permission_check(inode, MAY_EXEC, flags,
-                return ret;
+                                inode->i_op->check_acl);
        }
-        ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+        if (likely(!ret))
-        if (!ret)
                goto ok;
+        if (ret == -ECHILD)
+                return ret;
        if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
                goto ok;
        return ret;
 ok:
-        return security_inode_permission(inode, MAY_EXEC);
+        return security_inode_exec_permission(inode, flags);
 }
 static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +719,23 @@ static __always_inline void set_root(struct nameidata *nd)
 static int link_path_walk(const char *, struct nameidata *);
+static __always_inline void set_root_rcu(struct nameidata *nd)
+{
+        if (!nd->root.mnt) {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->root = fs->root;
+                } while (read_seqcount_retry(&fs->seq, seq));
+        }
+}
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
+        int ret;
        if (IS_ERR(link))
                goto fail;
@@ -500,8 +745,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                nd->path = nd->root;
                path_get(&nd->root);
        }
+        nd->inode = nd->path.dentry->d_inode;
-        return link_path_walk(link, nd);
+        ret = link_path_walk(link, nd);
+        return ret;
 fail:
        path_put(&nd->path);
        return PTR_ERR(link);
@@ -516,11 +763,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 {
-        dput(nd->path.dentry);
+        if (!(nd->flags & LOOKUP_RCU)) {
-        if (nd->path.mnt != path->mnt) {
+                dput(nd->path.dentry);
-                mntput(nd->path.mnt);
+                if (nd->path.mnt != path->mnt)
-                nd->path.mnt = path->mnt;
+                        mntput(nd->path.mnt);
        }
+        nd->path.mnt = path->mnt;
        nd->path.dentry = path->dentry;
 }
@@ -535,9 +783,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
        if (path->mnt != nd->path.mnt) {
                path_to_nameidata(path, nd);
+                nd->inode = nd->path.dentry->d_inode;
                dget(dentry);
        }
        mntget(path->mnt);
        nd->last_type = LAST_BIND;
        *p = dentry->d_inode->i_op->follow_link(dentry, nd);
        error = PTR_ERR(*p);
@@ -591,6 +841,20 @@ loop:
        return err;
 }
+static int follow_up_rcu(struct path *path)
+{
+        struct vfsmount *parent;
+        struct dentry *mountpoint;
+        parent = path->mnt->mnt_parent;
+        if (parent == path->mnt)
+                return 0;
+        mountpoint = path->mnt->mnt_mountpoint;
+        path->dentry = mountpoint;
+        path->mnt = parent;
+        return 1;
+}
 int follow_up(struct path *path)
 {
        struct vfsmount *parent;
@@ -612,9 +876,24 @@ int follow_up(struct path *path)
        return 1;
 }
-/* no need for dcache_lock, as serialization is taken care in
+/*
- * namespace.c
+ * serialization is taken care of in namespace.c
 */
+static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
+                                struct inode **inode)
+{
+        while (d_mountpoint(path->dentry)) {
+                struct vfsmount *mounted;
+                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+                if (!mounted)
+                        return;
+                path->mnt = mounted;
+                path->dentry = mounted->mnt_root;
+                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+                *inode = path->dentry->d_inode;
+        }
+}
 static int __follow_mount(struct path *path)
 {
        int res = 0;
@@ -645,9 +924,6 @@ static void follow_mount(struct path *path)
        }
 }
-/* no need for dcache_lock, as serialization is taken care in
- * namespace.c
- */
 int follow_down(struct path *path)
 {
        struct vfsmount *mounted;
@@ -663,7 +939,42 @@ int follow_down(struct path *path)
        return 0;
 }
-static __always_inline void follow_dotdot(struct nameidata *nd)
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
+        struct inode *inode = nd->inode;
+        set_root_rcu(nd);
+        while(1) {
+                if (nd->path.dentry == nd->root.dentry &&
+                    nd->path.mnt == nd->root.mnt) {
+                        break;
+                }
+                if (nd->path.dentry != nd->path.mnt->mnt_root) {
+                        struct dentry *old = nd->path.dentry;
+                        struct dentry *parent = old->d_parent;
+                        unsigned seq;
+                        seq = read_seqcount_begin(&parent->d_seq);
+                        if (read_seqcount_retry(&old->d_seq, nd->seq))
+                                return -ECHILD;
+                        inode = parent->d_inode;
+                        nd->path.dentry = parent;
+                        nd->seq = seq;
+                        break;
+                }
+                if (!follow_up_rcu(&nd->path))
+                        break;
+                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+                inode = nd->path.dentry->d_inode;
+        }
+        __follow_mount_rcu(nd, &nd->path, &inode);
+        nd->inode = inode;
+        return 0;
+}
+static void follow_dotdot(struct nameidata *nd)
 {
        set_root(nd);
@@ -684,6 +995,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
                        break;
        }
        follow_mount(&nd->path);
+        nd->inode = nd->path.dentry->d_inode;
 }
 /*
@@ -721,17 +1033,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
 *  It _is_ time-critical.
 */
 static int do_lookup(struct nameidata *nd, struct qstr *name,
-                     struct path *path)
+                        struct path *path, struct inode **inode)
 {
        struct vfsmount *mnt = nd->path.mnt;
-        struct dentry *dentry, *parent;
+        struct dentry *dentry, *parent = nd->path.dentry;
        struct inode *dir;
        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
-        if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
+        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-                int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
+                int err = parent->d_op->d_hash(parent, nd->inode, name);
                if (err < 0)
                        return err;
        }
@@ -741,21 +1053,44 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
         * of a false negative due to a concurrent rename, we're going to
         * do the non-racy lookup, below.
         */
-        dentry = __d_lookup(nd->path.dentry, name);
+        if (nd->flags & LOOKUP_RCU) {
-        if (!dentry)
+                unsigned seq;
-                goto need_lookup;
+                *inode = nd->inode;
+                dentry = __d_lookup_rcu(parent, name, &seq, inode);
+                if (!dentry) {
+                        if (nameidata_drop_rcu(nd))
+                                return -ECHILD;
+                        goto need_lookup;
+                }
+                /* Memory barrier in read_seqcount_begin of child is enough */
+                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+                        return -ECHILD;
+                nd->seq = seq;
+                if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+                        goto need_revalidate;
+                path->mnt = mnt;
+                path->dentry = dentry;
+                __follow_mount_rcu(nd, path, inode);
+        } else {
+                dentry = __d_lookup(parent, name);
+                if (!dentry)
+                        goto need_lookup;
 found:
-        if (dentry->d_op && dentry->d_op->d_revalidate)
+                if (dentry->d_flags & DCACHE_OP_REVALIDATE)
-                goto need_revalidate;
+                        goto need_revalidate;
 done:
-        path->mnt = mnt;
+                path->mnt = mnt;
-        path->dentry = dentry;
+                path->dentry = dentry;
-        __follow_mount(path);
+                __follow_mount(path);
+                *inode = path->dentry->d_inode;
+        }
        return 0;
 need_lookup:
-        parent = nd->path.dentry;
        dir = parent->d_inode;
+        BUG_ON(nd->inode != dir);
        mutex_lock(&dir->i_mutex);
        /*
@@ -817,7 +1152,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
 static int link_path_walk(const char *name, struct nameidata *nd)
 {
        struct path next;
-        struct inode *inode;
        int err;
        unsigned int lookup_flags = nd->flags;
        
@@ -826,18 +1160,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        if (!*name)
                goto return_reval;
-        inode = nd->path.dentry->d_inode;
        if (nd->depth)
                lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
        /* At this point we know we have a real path component. */
        for(;;) {
+                struct inode *inode;
                unsigned long hash;
                struct qstr this;
                unsigned int c;
                nd->flags |= LOOKUP_CONTINUE;
-                err = exec_permission(inode);
+                if (nd->flags & LOOKUP_RCU) {
+                        err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+                        if (err == -ECHILD) {
+                                if (nameidata_drop_rcu(nd))
+                                        return -ECHILD;
+                                goto exec_again;
+                        }
+                } else {
+exec_again:
+                        err = exec_permission(nd->inode, 0);
+                }
                if (err)
                        break;
@@ -868,37 +1212,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
-                        case 2: 
+                        case 2:
                                if (this.name[1] != '.')
                                        break;
-                                follow_dotdot(nd);
+                                if (nd->flags & LOOKUP_RCU) {
-                                inode = nd->path.dentry->d_inode;
+                                        if (follow_dotdot_rcu(nd))
+                                                return -ECHILD;
+                                } else
+                                        follow_dotdot(nd);
                                /* fallthrough */
                        case 1:
                                continue;
                }
                /* This does the actual lookups.. */
-                err = do_lookup(nd, &this, &next);
+                err = do_lookup(nd, &this, &next, &inode);
                if (err)
                        break;
                err = -ENOENT;
-                inode = next.dentry->d_inode;
                if (!inode)
                        goto out_dput;
                if (inode->i_op->follow_link) {
+                        /* We commonly drop rcu-walk here */
+                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+                                return -ECHILD;
+                        BUG_ON(inode != next.dentry->d_inode);
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
+                        nd->inode = nd->path.dentry->d_inode;
                        err = -ENOENT;
-                        inode = nd->path.dentry->d_inode;
+                        if (!nd->inode)
-                        if (!inode)
                                break;
-                } else
+                } else {
                        path_to_nameidata(&next, nd);
+                        nd->inode = inode;
+                }
                err = -ENOTDIR; 
-                if (!inode->i_op->lookup)
+                if (!nd->inode->i_op->lookup)
                        break;
                continue;
                /* here ends the main loop */
@@ -913,32 +1264,39 @@ last_component:
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
-                        case 2: 
+                        case 2:
                                if (this.name[1] != '.')
                                        break;
-                                follow_dotdot(nd);
+                                if (nd->flags & LOOKUP_RCU) {
-                                inode = nd->path.dentry->d_inode;
+                                        if (follow_dotdot_rcu(nd))
+                                                return -ECHILD;
+                                } else
+                                        follow_dotdot(nd);
                                /* fallthrough */
                        case 1:
                                goto return_reval;
                }
-                err = do_lookup(nd, &this, &next);
+                err = do_lookup(nd, &this, &next, &inode);
                if (err)
                        break;
-                inode = next.dentry->d_inode;
                if (follow_on_final(inode, lookup_flags)) {
+                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+                                return -ECHILD;
+                        BUG_ON(inode != next.dentry->d_inode);
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
-                        inode = nd->path.dentry->d_inode;
+                        nd->inode = nd->path.dentry->d_inode;
-                } else
+                } else {
                        path_to_nameidata(&next, nd);
+                        nd->inode = inode;
+                }
                err = -ENOENT;
-                if (!inode)
+                if (!nd->inode)
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
-                        if (!inode->i_op->lookup)
+                        if (!nd->inode->i_op->lookup)
                                break;
                }
                goto return_base;
@@ -958,25 +1316,43 @@ return_reval:
                 * We bypassed the ordinary revalidation routines.
                 * We may need to check the cached dentry for staleness.
                 */
-                if (nd->path.dentry && nd->path.dentry->d_sb &&
+                if (need_reval_dot(nd->path.dentry)) {
-                    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
-                        err = -ESTALE;
                        /* Note: we do not d_invalidate() */
-                        if (!nd->path.dentry->d_op->d_revalidate(
+                        err = d_revalidate(nd->path.dentry, nd);
-                                        nd->path.dentry, nd))
+                        if (!err)
+                                err = -ESTALE;
+                        if (err < 0)
                                break;
                }
 return_base:
+                if (nameidata_drop_rcu_last_maybe(nd))
+                        return -ECHILD;
                return 0;
 out_dput:
-                path_put_conditional(&next, nd);
+                if (!(nd->flags & LOOKUP_RCU))
+                        path_put_conditional(&next, nd);
                break;
        }
-        path_put(&nd->path);
+        if (!(nd->flags & LOOKUP_RCU))
+                path_put(&nd->path);
 return_err:
        return err;
 }
+/*
+ * Zero current->total_link_count and run link_path_walk() on @name.
+ * do_path_lookup() and do_filp_open() call this right after
+ * path_init_rcu(), i.e. for the rcu-walk attempt.  The body is the same as
+ * path_walk_simple(); only the name differs.
+ *
+ * Returns whatever link_path_walk() returns.
+ */
+static inline int path_walk_rcu(const char *name, struct nameidata *nd)
+{
+        current->total_link_count = 0;
+        return link_path_walk(name, nd);
+}
+/*
+ * Zero current->total_link_count and run link_path_walk() on @name.
+ * do_filp_open() uses this after path_init() for the ref-walk retry of the
+ * O_CREAT parent lookup.
+ *
+ * Returns whatever link_path_walk() returns.
+ */
+static inline int path_walk_simple(const char *name, struct nameidata *nd)
+{
+        current->total_link_count = 0;
+        return link_path_walk(name, nd);
+}
 static int path_walk(const char *name, struct nameidata *nd)
 {
        struct path save = nd->path;
@@ -1002,6 +1378,93 @@ static int path_walk(const char *name, struct nameidata *nd)
        return result;
 }
+/*
+ * Clean up after a walk that was started with path_init_rcu().
+ *
+ * If LOOKUP_RCU is still set in nd->flags, the locks taken by
+ * path_init_rcu() are still held: clear the flag and release them in the
+ * reverse order they were taken.  Independently of that, drop the file
+ * that path_init_rcu() stashed in nd->file (it does so only when
+ * fget_light() reported that a reference was taken).
+ */
+static void path_finish_rcu(struct nameidata *nd)
+{
+        if (nd->flags & LOOKUP_RCU) {
+                /* Still in rcu-walk at the end of the walk: cancel it here. */
+                nd->flags &= ~LOOKUP_RCU;
+                /*
+                 * path_init_rcu() copies fs->root into nd->root without a
+                 * path_get(); clearing root.mnt keeps callers that test
+                 * nd->root.mnt from doing a path_put() on it.
+                 */
+                nd->root.mnt = NULL;
+                rcu_read_unlock();
+                br_read_unlock(vfsmount_lock);
+        }
+        if (nd->file)
+                fput(nd->file);
+}
+/*
+ * Set up @nd for an rcu-walk of @name: record @flags with LOOKUP_RCU
+ * added, pick the starting point and enter the rcu-walk read side.
+ *
+ * Starting point:
+ *   - @name begins with '/':  current->fs->root (also stored in nd->root)
+ *   - @dfd == AT_FDCWD:       current->fs->pwd
+ *   - otherwise:              the directory that @dfd refers to
+ *
+ * On success (return 0) vfsmount_lock is held for read,
+ * rcu_read_lock() is held, and nd->path, nd->seq and nd->inode describe
+ * the starting dentry.  nd->path is copied without taking a reference.
+ *
+ * On failure no lock is held.  Returns -EBADF if @dfd is not an open file,
+ * -ENOTDIR if it is not a directory, or the error from
+ * file_permission(file, MAY_EXEC).
+ */
+static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+{
+        int retval = 0;
+        int fput_needed;
+        struct file *file;
+        nd->last_type = LAST_ROOT; /* if there are only slashes... */
+        nd->flags = flags | LOOKUP_RCU;
+        nd->depth = 0;
+        nd->root.mnt = NULL;
+        nd->file = NULL;
+        if (*name=='/') {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+                /*
+                 * Retry until fs->root and the d_seq sample were read
+                 * without fs->seq changing in between.
+                 */
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->root = fs->root;
+                        nd->path = nd->root;
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                } while (read_seqcount_retry(&fs->seq, seq));
+        } else if (dfd == AT_FDCWD) {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+                /* Same retry scheme as above, starting from fs->pwd. */
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->path = fs->pwd;
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                } while (read_seqcount_retry(&fs->seq, seq));
+        } else {
+                struct dentry *dentry;
+                file = fget_light(dfd, &fput_needed);
+                retval = -EBADF;
+                if (!file)
+                        goto out_fail;
+                dentry = file->f_path.dentry;
+                retval = -ENOTDIR;
+                if (!S_ISDIR(dentry->d_inode->i_mode))
+                        goto fput_fail;
+                retval = file_permission(file, MAY_EXEC);
+                if (retval)
+                        goto fput_fail;
+                nd->path = file->f_path;
+                /*
+                 * The file is not released here on success.  If
+                 * fget_light() took a reference, remember the file so
+                 * path_finish_rcu() can fput() it.
+                 */
+                if (fput_needed)
+                        nd->file = file;
+                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                /* Locks are taken last, so the gotos above leave none held. */
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+        }
+        nd->inode = nd->path.dentry->d_inode;
+        return 0;
+fput_fail:
+        fput_light(file, fput_needed);
+out_fail:
+        /*
+         * NOTE(review): nd->flags still has LOOKUP_RCU set here although no
+         * lock is held; the callers visible in this file do not call
+         * path_finish_rcu() after a failure -- confirm for any new caller.
+         */
+        return retval;
+}
 static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
        int retval = 0;
@@ -1042,6 +1505,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
                fput_light(file, fput_needed);
        }
+        nd->inode = nd->path.dentry->d_inode;
        return 0;
 fput_fail:
@@ -1054,16 +1518,53 @@ out_fail:
 static int do_path_lookup(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
 {
-        int retval = path_init(dfd, name, flags, nd);
+        int retval;
-        if (!retval)
-                retval = path_walk(name, nd);
+        /*
-        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
+         * Path walking is largely split up into 2 different synchronisation
-                                nd->path.dentry->d_inode))
+         * schemes, rcu-walk and ref-walk (explained in
-                audit_inode(name, nd->path.dentry);
+         * Documentation/filesystems/path-lookup.txt). These share much of the
+         * path walk code, but some things, particularly setup, cleanup, and
+         * following mounts, are sufficiently divergent that functions are
+         * duplicated. Typically there is a function foo(), and its RCU
+         * analogue, foo_rcu().
+         *
+         * -ECHILD is the error number of choice (just to avoid clashes) that
+         * is returned if some aspect of an rcu-walk fails. Such an error must
+         * be handled by restarting a traditional ref-walk (which will always
+         * be able to complete).
+         */
+        retval = path_init_rcu(dfd, name, flags, nd);
+        if (unlikely(retval))
+                return retval;
+        retval = path_walk_rcu(name, nd);
+        path_finish_rcu(nd);
        if (nd->root.mnt) {
                path_put(&nd->root);
                nd->root.mnt = NULL;
        }
+        if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
+                /* slower, locked walk */
+                if (retval == -ESTALE)
+                        flags |= LOOKUP_REVAL;
+                retval = path_init(dfd, name, flags, nd);
+                if (unlikely(retval))
+                        return retval;
+                retval = path_walk(name, nd);
+                if (nd->root.mnt) {
+                        path_put(&nd->root);
+                        nd->root.mnt = NULL;
+                }
+        }
+        if (likely(!retval)) {
+                if (unlikely(!audit_dummy_context())) {
+                        if (nd->path.dentry && nd->inode)
+                                audit_inode(name, nd->path.dentry);
+                }
+        }
        return retval;
 }
@@ -1106,10 +1607,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
        path_get(&nd->path);
        nd->root = nd->path;
        path_get(&nd->root);
+        nd->inode = nd->path.dentry->d_inode;
        retval = path_walk(name, nd);
        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-                                nd->path.dentry->d_inode))
+                                nd->inode))
                audit_inode(name, nd->path.dentry);
        path_put(&nd->root);
@@ -1125,7 +1627,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
        struct dentry *dentry;
        int err;
-        err = exec_permission(inode);
+        err = exec_permission(inode, 0);
        if (err)
                return ERR_PTR(err);
@@ -1133,8 +1635,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
         * See if the low-level filesystem might want
         * to use its own hash..
         */
-        if (base->d_op && base->d_op->d_hash) {
+        if (base->d_flags & DCACHE_OP_HASH) {
-                err = base->d_op->d_hash(base, name);
+                err = base->d_op->d_hash(base, inode, name);
                dentry = ERR_PTR(err);
                if (err < 0)
                        goto out;
@@ -1147,7 +1649,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
         */
        dentry = d_lookup(base, name);
-        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+        if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
                dentry = do_revalidate(dentry, nd);
        if (!dentry)
@@ -1490,6 +1992,7 @@ out_unlock:
        mutex_unlock(&dir->d_inode->i_mutex);
        dput(nd->path.dentry);
        nd->path.dentry = path->dentry;
        if (error)
                return error;
        /* Don't check for write permission, don't truncate */
@@ -1584,6 +2087,9 @@ exit:
        return ERR_PTR(error);
 }
+/*
+ * Handle O_CREAT case for do_filp_open
+ */
 static struct file *do_last(struct nameidata *nd, struct path *path,
                            int open_flag, int acc_mode,
                            int mode, const char *pathname)
@@ -1597,50 +2103,25 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                follow_dotdot(nd);
                dir = nd->path.dentry;
        case LAST_DOT:
-                if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+                if (need_reval_dot(dir)) {
-                        if (!dir->d_op->d_revalidate(dir, nd)) {
+                        error = d_revalidate(nd->path.dentry, nd);
+                        if (!error)
                                error = -ESTALE;
+                        if (error < 0)
                                goto exit;
-                        }
                }
                /* fallthrough */
        case LAST_ROOT:
-                if (open_flag & O_CREAT)
+                goto exit;
-                        goto exit;
-                /* fallthrough */
        case LAST_BIND:
                audit_inode(pathname, dir);
                goto ok;
        }
        /* trailing slashes? */
-        if (nd->last.name[nd->last.len]) {
+        if (nd->last.name[nd->last.len])
-                if (open_flag & O_CREAT)
+                goto exit;
-                        goto exit;
-                nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
-        }
-        /* just plain open? */
-        if (!(open_flag & O_CREAT)) {
-                error = do_lookup(nd, &nd->last, path);
-                if (error)
-                        goto exit;
-                error = -ENOENT;
-                if (!path->dentry->d_inode)
-                        goto exit_dput;
-                if (path->dentry->d_inode->i_op->follow_link)
-                        return NULL;
-                error = -ENOTDIR;
-                if (nd->flags & LOOKUP_DIRECTORY) {
-                        if (!path->dentry->d_inode->i_op->lookup)
-                                goto exit_dput;
-                }
-                path_to_nameidata(path, nd);
-                audit_inode(pathname, nd->path.dentry);
-                goto ok;
-        }
-        /* OK, it's O_CREAT */
        mutex_lock(&dir->d_inode->i_mutex);
        path->dentry = lookup_hash(nd);
@@ -1711,8 +2192,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                return NULL;
        path_to_nameidata(path, nd);
+        nd->inode = path->dentry->d_inode;
        error = -EISDIR;
-        if (S_ISDIR(path->dentry->d_inode->i_mode))
+        if (S_ISDIR(nd->inode->i_mode))
                goto exit;
 ok:
        filp = finish_open(nd, open_flag, acc_mode);
@@ -1743,7 +2225,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
        struct path path;
        int count = 0;
        int flag = open_to_namei_flags(open_flag);
-        int force_reval = 0;
+        int flags;
        if (!(open_flag & O_CREAT))
                mode = 0;
@@ -1772,54 +2254,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
        if (open_flag & O_APPEND)
                acc_mode |= MAY_APPEND;
-        /* find the parent */
+        flags = LOOKUP_OPEN;
-reval:
+        if (open_flag & O_CREAT) {
-        error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+                flags |= LOOKUP_CREATE;
+                if (open_flag & O_EXCL)
+                        flags |= LOOKUP_EXCL;
+        }
+        if (open_flag & O_DIRECTORY)
+                flags |= LOOKUP_DIRECTORY;
+        if (!(open_flag & O_NOFOLLOW))
+                flags |= LOOKUP_FOLLOW;
+        filp = get_empty_filp();
+        if (!filp)
+                return ERR_PTR(-ENFILE);
+        filp->f_flags = open_flag;
+        nd.intent.open.file = filp;
+        nd.intent.open.flags = flag;
+        nd.intent.open.create_mode = mode;
+        if (open_flag & O_CREAT)
+                goto creat;
+        /* !O_CREAT, simple open */
+        error = do_path_lookup(dfd, pathname, flags, &nd);
+        if (unlikely(error))
+                goto out_filp;
+        error = -ELOOP;
+        if (!(nd.flags & LOOKUP_FOLLOW)) {
+                if (nd.inode->i_op->follow_link)
+                        goto out_path;
+        }
+        error = -ENOTDIR;
+        if (nd.flags & LOOKUP_DIRECTORY) {
+                if (!nd.inode->i_op->lookup)
+                        goto out_path;
+        }
+        audit_inode(pathname, nd.path.dentry);
+        filp = finish_open(&nd, open_flag, acc_mode);
+        return filp;
+creat:
+        /* OK, have to create the file. Find the parent. */
+        error = path_init_rcu(dfd, pathname,
+                        LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
        if (error)
-                return ERR_PTR(error);
+                goto out_filp;
-        if (force_reval)
+        error = path_walk_rcu(pathname, &nd);
-                nd.flags |= LOOKUP_REVAL;
+        path_finish_rcu(&nd);
+        if (unlikely(error == -ECHILD || error == -ESTALE)) {
+                /* slower, locked walk */
+                if (error == -ESTALE) {
+reval:
+                        flags |= LOOKUP_REVAL;
+                }
+                error = path_init(dfd, pathname,
+                                LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
+                if (error)
+                        goto out_filp;
-        current->total_link_count = 0;
+                error = path_walk_simple(pathname, &nd);
-        error = link_path_walk(pathname, &nd);
-        if (error) {
-                filp = ERR_PTR(error);
-                goto out;
        }
-        if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+        if (unlikely(error))
+                goto out_filp;
+        if (unlikely(!audit_dummy_context()))
                audit_inode(pathname, nd.path.dentry);
        /*
         * We have the parent and last component.
         */
+        nd.flags = flags;
-        error = -ENFILE;
-        filp = get_empty_filp();
-        if (filp == NULL)
-                goto exit_parent;
-        nd.intent.open.file = filp;
-        filp->f_flags = open_flag;
-        nd.intent.open.flags = flag;
-        nd.intent.open.create_mode = mode;
-        nd.flags &= ~LOOKUP_PARENT;
-        nd.flags |= LOOKUP_OPEN;
-        if (open_flag & O_CREAT) {
-                nd.flags |= LOOKUP_CREATE;
-                if (open_flag & O_EXCL)
-                        nd.flags |= LOOKUP_EXCL;
-        }
-        if (open_flag & O_DIRECTORY)
-                nd.flags |= LOOKUP_DIRECTORY;
-        if (!(open_flag & O_NOFOLLOW))
-                nd.flags |= LOOKUP_FOLLOW;
        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
                struct path holder;
-                struct inode *inode = path.dentry->d_inode;
                void *cookie;
                error = -ELOOP;
                /* S_ISDIR part is a temporary automount kludge */
-                if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
+                if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
                        goto exit_dput;
                if (count++ == 32)
                        goto exit_dput;
@@ -1840,36 +2352,33 @@ reval:
                        goto exit_dput;
                error = __do_follow_link(&path, &nd, &cookie);
                if (unlikely(error)) {
+                        if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
+                                nd.inode->i_op->put_link(path.dentry, &nd, cookie);
                        /* nd.path had been dropped */
-                        if (!IS_ERR(cookie) && inode->i_op->put_link)
+                        nd.path = path;
-                                inode->i_op->put_link(path.dentry, &nd, cookie);
+                        goto out_path;
-                        path_put(&path);
-                        release_open_intent(&nd);
-                        filp = ERR_PTR(error);
-                        goto out;
                }
                holder = path;
                nd.flags &= ~LOOKUP_PARENT;
                filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-                if (inode->i_op->put_link)
+                if (nd.inode->i_op->put_link)
-                        inode->i_op->put_link(holder.dentry, &nd, cookie);
+                        nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
                path_put(&holder);
        }
 out:
        if (nd.root.mnt)
                path_put(&nd.root);
-        if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+        if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-                force_reval = 1;
                goto reval;
-        }
        return filp;
 exit_dput:
        path_put_conditional(&path, &nd);
+out_path:
+        path_put(&nd.path);
+out_filp:
        if (!IS_ERR(nd.intent.open.file))
                release_open_intent(&nd);
-exit_parent:
-        path_put(&nd.path);
        filp = ERR_PTR(error);
        goto out;
 }
@@ -2130,12 +2639,10 @@ void dentry_unhash(struct dentry *dentry)
 {
        dget(dentry);
        shrink_dcache_parent(dentry);
-        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count) == 2)
+        if (dentry->d_count == 2)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
 }
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/namespace.c b/fs/namespace.c
index 3dbfc072ec7..3ddfd9046c4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -138,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
        mnt->mnt_group_id = 0;
 }
+/*
+ * Add @n to the reference count of @mnt; @n may be negative
+ * (mnt_dec_count() passes -1).
+ *
+ * With CONFIG_SMP the count lives in the per-CPU area mnt->mnt_pcp and
+ * only the current CPU's slot is changed.  Without CONFIG_SMP it is the
+ * plain mnt->mnt_count, updated with preemption disabled.
+ *
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_add_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
+#else
+        preempt_disable();
+        mnt->mnt_count += n;
+        preempt_enable();
+#endif
+}
+/*
+ * Overwrite the reference count of @mnt with @n: the current CPU's
+ * mnt_pcp->mnt_count slot with CONFIG_SMP, the plain mnt->mnt_count
+ * otherwise.
+ *
+ * NOTE(review): with CONFIG_SMP only the current CPU's slot is written,
+ * while mnt_get_count() sums every CPU's slot plus mnt_longrefs, so the
+ * total becomes @n only if all of those are zero -- confirm against the
+ * callers.
+ */
+static inline void mnt_set_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+        this_cpu_write(mnt->mnt_pcp->mnt_count, n);
+#else
+        mnt->mnt_count = n;
+#endif
+}
+/*
+ * Take one reference on @mnt: shorthand for mnt_add_count(mnt, 1).
+ *
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_inc_count(struct vfsmount *mnt)
+{
+        mnt_add_count(mnt, 1);
+}
+/*
+ * Drop one reference on @mnt: shorthand for mnt_add_count(mnt, -1).
+ * Nothing is checked or freed here; the count is only adjusted.
+ *
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_dec_count(struct vfsmount *mnt)
+{
+        mnt_add_count(mnt, -1);
+}
+/*
+ * Return the total reference count of @mnt.
+ *
+ * With CONFIG_SMP this is mnt_longrefs plus the sum of the mnt_count slot
+ * of every possible CPU; a single slot on its own is not the count
+ * (presumably a slot can even be negative when a reference is taken on
+ * one CPU and dropped on another -- confirm).  Without CONFIG_SMP it is
+ * simply mnt->mnt_count.
+ *
+ * vfsmount lock must be held for write
+ * (presumably to keep out the updaters above, which run with it held for
+ * read, while the slots are being summed -- confirm).
+ */
+unsigned int mnt_get_count(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        unsigned int count = atomic_read(&mnt->mnt_longrefs);
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
+        }
+        return count;
+#else
+        return mnt->mnt_count;
+#endif
+}
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
        struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -154,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
                                goto out_free_id;
                }
-                atomic_set(&mnt->mnt_count, 1);
+#ifdef CONFIG_SMP
+                mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+                if (!mnt->mnt_pcp)
+                        goto out_free_devname;
+                atomic_set(&mnt->mnt_longrefs, 1);
+#else
+                mnt->mnt_count = 1;
+                mnt->mnt_writers = 0;
+#endif
                INIT_LIST_HEAD(&mnt->mnt_hash);
                INIT_LIST_HEAD(&mnt->mnt_child);
                INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -166,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 #ifdef CONFIG_FSNOTIFY
                INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
-#ifdef CONFIG_SMP
-                mnt->mnt_writers = alloc_percpu(int);
-                if (!mnt->mnt_writers)
-                        goto out_free_devname;
-#else
-                mnt->mnt_writers = 0;
-#endif
        }
        return mnt;
@@ -216,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-static inline void inc_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_inc_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
 #else
        mnt->mnt_writers++;
 #endif
 }
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_dec_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
 #else
        mnt->mnt_writers--;
 #endif
 }
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static unsigned int mnt_get_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
        unsigned int count = 0;
        int cpu;
        for_each_possible_cpu(cpu) {
-                count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
        }
        return count;
@@ -273,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
        int ret = 0;
        preempt_disable();
-        inc_mnt_writers(mnt);
+        mnt_inc_writers(mnt);
        /*
-         * The store to inc_mnt_writers must be visible before we pass
+         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
@@ -289,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
         */
        smp_rmb();
        if (__mnt_is_readonly(mnt)) {
-                dec_mnt_writers(mnt);
+                mnt_dec_writers(mnt);
                ret = -EROFS;
                goto out;
        }
@@ -317,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
        if (__mnt_is_readonly(mnt))
                return -EROFS;
        preempt_disable();
-        inc_mnt_writers(mnt);
+        mnt_inc_writers(mnt);
        preempt_enable();
        return 0;
 }
@@ -351,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
 void mnt_drop_write(struct vfsmount *mnt)
 {
        preempt_disable();
-        dec_mnt_writers(mnt);
+        mnt_dec_writers(mnt);
        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -384,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
-        if (count_mnt_writers(mnt) > 0)
+        if (mnt_get_writers(mnt) > 0)
                ret = -EBUSY;
        else
                mnt->mnt_flags |= MNT_READONLY;
@@ -418,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
        kfree(mnt->mnt_devname);
        mnt_free_id(mnt);
 #ifdef CONFIG_SMP
-        free_percpu(mnt->mnt_writers);
+        free_percpu(mnt->mnt_pcp);
 #endif
        kmem_cache_free(mnt_cache, mnt);
 }
@@ -492,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 }
 /*
+ * Clear dentry's mounted state if it has no remaining mounts.
+ * vfsmount_lock must be held for write.
+ */
+static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+        unsigned u;
+        for (u = 0; u < HASH_SIZE; u++) {
+                struct vfsmount *p;
+                list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
+                        if (p->mnt_mountpoint == dentry)
+                                return;
+                }
+        }
+        spin_lock(&dentry->d_lock);
+        dentry->d_flags &= ~DCACHE_MOUNTED;
+        spin_unlock(&dentry->d_lock);
+}
+/*
 * vfsmount lock must be held for write
 */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
@@ -502,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
        mnt->mnt_mountpoint = mnt->mnt_root;
        list_del_init(&mnt->mnt_child);
        list_del_init(&mnt->mnt_hash);
-        old_path->dentry->d_mounted--;
+        dentry_reset_mounted(old_path->mnt, old_path->dentry);
 }
 /*
@@ -513,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 {
        child_mnt->mnt_parent = mntget(mnt);
        child_mnt->mnt_mountpoint = dget(dentry);
-        dentry->d_mounted++;
+        spin_lock(&dentry->d_lock);
+        dentry->d_flags |= DCACHE_MOUNTED;
+        spin_unlock(&dentry->d_lock);
 }
 /*
@@ -629,9 +713,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
        return NULL;
 }
-static inline void __mntput(struct vfsmount *mnt)
+static inline void mntfree(struct vfsmount *mnt)
 {
        struct super_block *sb = mnt->mnt_sb;
        /*
         * This probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this
@@ -639,38 +724,123 @@ static inline void __mntput(struct vfsmount *mnt)
         * to make r/w->r/o transitions.
         */
        /*
-         * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+         * The locking used to deal with mnt_count decrement provides barriers,
-         * provides barriers, so count_mnt_writers() below is safe.  AV
+         * so mnt_get_writers() below is safe.
         */
-        WARN_ON(count_mnt_writers(mnt));
+        WARN_ON(mnt_get_writers(mnt));
        fsnotify_vfsmount_delete(mnt);
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
 }
-void mntput_no_expire(struct vfsmount *mnt)
+#ifdef CONFIG_SMP
-{
+static inline void __mntput(struct vfsmount *mnt, int longrefs)
-repeat:
+{
-        if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+        if (!longrefs) {
-                return;
+put_again:
+                br_read_lock(vfsmount_lock);
+                if (likely(atomic_read(&mnt->mnt_longrefs))) {
+                        mnt_dec_count(mnt);
+                        br_read_unlock(vfsmount_lock);
+                        return;
+                }
+                br_read_unlock(vfsmount_lock);
+        } else {
+                BUG_ON(!atomic_read(&mnt->mnt_longrefs));
+                if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
+                        return;
+        }
        br_write_lock(vfsmount_lock);
-        if (!atomic_dec_and_test(&mnt->mnt_count)) {
+        if (!longrefs)
+                mnt_dec_count(mnt);
+        else
+                atomic_dec(&mnt->mnt_longrefs);
+        if (mnt_get_count(mnt)) {
                br_write_unlock(vfsmount_lock);
                return;
        }
-        if (likely(!mnt->mnt_pinned)) {
+        if (unlikely(mnt->mnt_pinned)) {
+                mnt_add_count(mnt, mnt->mnt_pinned + 1);
+                mnt->mnt_pinned = 0;
                br_write_unlock(vfsmount_lock);
-                __mntput(mnt);
+                acct_auto_close_mnt(mnt);
+                goto put_again;
+        }
+        br_write_unlock(vfsmount_lock);
+        mntfree(mnt);
+}
+#else
+static inline void __mntput(struct vfsmount *mnt, int longrefs)
+{
+put_again:
+        mnt_dec_count(mnt);
+        if (likely(mnt_get_count(mnt)))
                return;
+        br_write_lock(vfsmount_lock);
+        if (unlikely(mnt->mnt_pinned)) {
+                mnt_add_count(mnt, mnt->mnt_pinned + 1);
+                mnt->mnt_pinned = 0;
+                br_write_unlock(vfsmount_lock);
+                acct_auto_close_mnt(mnt);
+                goto put_again;
        }
-        atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-        mnt->mnt_pinned = 0;
        br_write_unlock(vfsmount_lock);
-        acct_auto_close_mnt(mnt);
+        mntfree(mnt);
-        goto repeat;
+}
+#endif
+static void mntput_no_expire(struct vfsmount *mnt)
+{
+        __mntput(mnt, 0);
+}
+void mntput(struct vfsmount *mnt)
+{
+        if (mnt) {
+                /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+                if (unlikely(mnt->mnt_expiry_mark))
+                        mnt->mnt_expiry_mark = 0;
+                __mntput(mnt, 0);
+        }
+}
+EXPORT_SYMBOL(mntput);
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+        if (mnt)
+                mnt_inc_count(mnt);
+        return mnt;
 }
-EXPORT_SYMBOL(mntput_no_expire);
+EXPORT_SYMBOL(mntget);
+void mntput_long(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        if (mnt) {
+                /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+                if (unlikely(mnt->mnt_expiry_mark))
+                        mnt->mnt_expiry_mark = 0;
+                __mntput(mnt, 1);
+        }
+#else
+        mntput(mnt);
+#endif
+}
+EXPORT_SYMBOL(mntput_long);
+struct vfsmount *mntget_long(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        if (mnt)
+                atomic_inc(&mnt->mnt_longrefs);
+        return mnt;
+#else
+        return mntget(mnt);
+#endif
+}
+EXPORT_SYMBOL(mntget_long);
 void mnt_pin(struct vfsmount *mnt)
 {
@@ -678,19 +848,17 @@ void mnt_pin(struct vfsmount *mnt)
        mnt->mnt_pinned++;
        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 void mnt_unpin(struct vfsmount *mnt)
 {
        br_write_lock(vfsmount_lock);
        if (mnt->mnt_pinned) {
-                atomic_inc(&mnt->mnt_count);
+                mnt_inc_count(mnt);
                mnt->mnt_pinned--;
        }
        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 static inline void mangle(struct seq_file *m, const char *s)
@@ -985,12 +1153,13 @@ int may_umount_tree(struct vfsmount *mnt)
        int minimum_refs = 0;
        struct vfsmount *p;
-        br_read_lock(vfsmount_lock);
+        /* write lock needed for mnt_get_count */
+        br_write_lock(vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
-                actual_refs += atomic_read(&p->mnt_count);
+                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
        }
-        br_read_unlock(vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        if (actual_refs > minimum_refs)
                return 0;
@@ -1017,10 +1186,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-        br_read_lock(vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-        br_read_unlock(vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1047,7 +1216,7 @@ void release_mounts(struct list_head *head)
                        dput(dentry);
                        mntput(m);
                }
-                mntput(mnt);
+                mntput_long(mnt);
        }
 }
@@ -1073,7 +1242,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
                list_del_init(&p->mnt_child);
                if (p->mnt_parent != p) {
                        p->mnt_parent->mnt_ghosts++;
-                        p->mnt_mountpoint->d_mounted--;
+                        dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
                }
                change_mnt_propagation(p, MS_PRIVATE);
        }
@@ -1102,8 +1271,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;
-                if (atomic_read(&mnt->mnt_count) != 2)
+                /*
+                 * probably don't strictly need the lock here if we examined
+                 * all race cases, but it's a slowpath.
+                 */
+                br_write_lock(vfsmount_lock);
+                if (mnt_get_count(mnt) != 2) {
+                        br_write_unlock(vfsmount_lock);
                        return -EBUSY;
+                }
+                br_write_unlock(vfsmount_lock);
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
@@ -1792,7 +1969,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 unlock:
        up_write(&namespace_sem);
-        mntput(newmnt);
+        mntput_long(newmnt);
        return err;
 }
@@ -2125,11 +2302,11 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                if (fs) {
                        if (p == fs->root.mnt) {
                                rootmnt = p;
-                                fs->root.mnt = mntget(q);
+                                fs->root.mnt = mntget_long(q);
                        }
                        if (p == fs->pwd.mnt) {
                                pwdmnt = p;
-                                fs->pwd.mnt = mntget(q);
+                                fs->pwd.mnt = mntget_long(q);
                        }
                }
                p = next_mnt(p, mnt_ns->root);
@@ -2138,9 +2315,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
        up_write(&namespace_sem);
        if (rootmnt)
-                mntput(rootmnt);
+                mntput_long(rootmnt);
        if (pwdmnt)
-                mntput(pwdmnt);
+                mntput_long(pwdmnt);
        return new_ns;
 }
@@ -2327,6 +2504,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        br_write_unlock(vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
        path_put(&root_parent);
        path_put(&parent_path);
@@ -2353,6 +2531,7 @@ static void __init init_mount_tree(void)
        mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");
        ns = create_mnt_ns(mnt);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f22b12e7d33..28f136d4aae 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
@@ -74,9 +75,12 @@ const struct inode_operations ncp_dir_inode_operations =
 * Dentry operations routines
 */
 static int ncp_lookup_validate(struct dentry *, struct nameidata *);
-static int ncp_hash_dentry(struct dentry *, struct qstr *);
+static int ncp_hash_dentry(const struct dentry *, const struct inode *,
-static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
+                struct qstr *);
-static int ncp_delete_dentry(struct dentry *);
+static int ncp_compare_dentry(const struct dentry *, const struct inode *,
+                const struct dentry *, const struct inode *,
+                unsigned int, const char *, const struct qstr *);
+static int ncp_delete_dentry(const struct dentry *);
 static const struct dentry_operations ncp_dentry_operations =
 {
@@ -113,10 +117,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
 #define ncp_preserve_case(i)    (ncp_namespace(i) != NW_NS_DOS)
-static inline int ncp_case_sensitive(struct dentry *dentry)
+static inline int ncp_case_sensitive(const struct inode *i)
 {
 #ifdef CONFIG_NCPFS_NFS_NS
-        return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+        return ncp_namespace(i) == NW_NS_NFS;
 #else
        return 0;
 #endif /* CONFIG_NCPFS_NFS_NS */
@@ -127,14 +131,16 @@ static inline int ncp_case_sensitive(struct dentry *dentry)
 * is case-sensitive.
 */
 static int 
-ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *this)
 {
-        if (!ncp_case_sensitive(dentry)) {
+        if (!ncp_case_sensitive(inode)) {
+                struct super_block *sb = dentry->d_sb;
                struct nls_table *t;
                unsigned long hash;
                int i;
-                t = NCP_IO_TABLE(dentry);
+                t = NCP_IO_TABLE(sb);
                hash = init_name_hash();
                for (i=0; i<this->len ; i++)
                        hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -145,15 +151,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 }
 static int
-ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        if (a->len != b->len)
+        if (len != name->len)
                return 1;
-        if (ncp_case_sensitive(dentry))
+        if (ncp_case_sensitive(pinode))
-                return strncmp(a->name, b->name, a->len);
+                return strncmp(str, name->name, len);
-        return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
+        return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
 }
 /*
@@ -162,7 +170,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
 * Closing files can be safely postponed until iput() - it's done there anyway.
 */
 static int
-ncp_delete_dentry(struct dentry * dentry)
+ncp_delete_dentry(const struct dentry * dentry)
 {
        struct inode *inode = dentry->d_inode;
@@ -301,6 +309,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
        int res, val = 0, len;
        __u8 __name[NCP_MAXPATHLEN + 1];
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
@@ -384,21 +395,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
        }
        /* If a pointer is invalid, we search the dentry. */
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dent = list_entry(next, struct dentry, d_u.d_child);
                if ((unsigned long)dent->d_fsdata == fpos) {
                        if (dent->d_inode)
-                                dget_locked(dent);
+                                dget(dent);
                        else
                                dent = NULL;
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&parent->d_lock);
                        goto out;
                }
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
        return NULL;
 out:
@@ -592,7 +603,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        qname.hash = full_name_hash(qname.name, qname.len);
        if (dentry->d_op && dentry->d_op->d_hash)
-                if (dentry->d_op->d_hash(dentry, &qname) != 0)
+                if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
                        goto end_advance;
        newdent = d_lookup(dentry, &qname);
@@ -611,35 +622,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                        shrink_dcache_parent(newdent);
                /*
-                 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
+                 * NetWare's OS2 namespace is case preserving yet case
-                 * case preserving yet case insensitive.  So we update dentry's name
+                 * insensitive.  So we update dentry's name as received from
-                 * as received from server.  We found dentry via d_lookup with our
+                 * server. Parent dir's i_mutex is locked because we're in
-                 * hash, so we know that hash does not change, and so replacing name
+                 * readdir.
-                 * should be reasonably safe.
                 */
-                if (qname.len == newdent->d_name.len &&
+                dentry_update_name_case(newdent, &qname);
-                    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
-                        struct inode *inode = newdent->d_inode;
-                        /*
-                         * Inside ncpfs all uses of d_name are either for debugging,
-                         * or on functions which acquire inode mutex (mknod, creat,
-                         * lookup).  So grab i_mutex here, to be sure.  d_path
-                         * uses dcache_lock when generating path, so we should too.
-                         * And finally d_compare is protected by dentry's d_lock, so
-                         * here we go.
-                         */
-                        if (inode)
-                                mutex_lock(&inode->i_mutex);
-                        spin_lock(&dcache_lock);
-                        spin_lock(&newdent->d_lock);
-                        memcpy((char *) newdent->d_name.name, qname.name,
-                                                                newdent->d_name.len);
-                        spin_unlock(&newdent->d_lock);
-                        spin_unlock(&dcache_lock);
-                        if (inode)
-                                mutex_unlock(&inode->i_mutex);
-                }
        }
        if (!newdent->d_inode) {
@@ -649,7 +637,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                entry->ino = iunique(dir->i_sb, 2);
                inode = ncp_iget(dir->i_sb, entry);
                if (inode) {
-                        newdent->d_op = &ncp_dentry_operations;
+                        d_set_d_op(newdent, &ncp_dentry_operations);
                        d_instantiate(newdent, inode);
                        if (!hashed)
                                d_rehash(newdent);
@@ -657,7 +645,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        } else {
                struct inode *inode = newdent->d_inode;
-                mutex_lock(&inode->i_mutex);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
                ncp_update_inode2(inode, entry);
                mutex_unlock(&inode->i_mutex);
        }
@@ -905,7 +893,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
        if (inode) {
                ncp_new_dentry(dentry);
 add_entry:
-                dentry->d_op = &ncp_dentry_operations;
+                d_set_d_op(dentry, &ncp_dentry_operations);
                d_add(dentry, inode);
                error = 0;
        }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8fb93b604e7..9b39a5dd413 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -29,6 +29,7 @@
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/namei.h>
 #include <linux/ncp_fs.h>
@@ -58,11 +59,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ncp_destroy_inode(struct inode *inode)
+static void ncp_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
+static void ncp_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ncp_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
@@ -309,7 +317,12 @@ static void ncp_stop_tasks(struct ncp_server *server) {
        sk->sk_write_space  = server->write_space;
        release_sock(sk);
        del_timer_sync(&server->timeout_tm);
-        flush_scheduled_work();
+        flush_work_sync(&server->rcv.tq);
+        if (sk->sk_socket->type == SOCK_STREAM)
+                flush_work_sync(&server->tx.tq);
+        else
+                flush_work_sync(&server->timeout_tq);
 }
 static int  ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -710,7 +723,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        sb->s_root = d_alloc_root(root_inode);
        if (!sb->s_root)
                goto out_no_root;
-        sb->s_root->d_op = &ncp_root_dentry_operations;
+        d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
        return 0;
 out_no_root:
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 3c57eca634c..1220df75ff2 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -135,7 +135,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
                                const unsigned char *, unsigned int, int);
 #define NCP_ESC                 ':'
-#define NCP_IO_TABLE(dentry)    (NCP_SERVER((dentry)->d_inode)->nls_io)
+#define NCP_IO_TABLE(sb)        (NCP_SBP(sb)->nls_io)
 #define ncp_tolower(t, c)       nls_tolower(t, c)
 #define ncp_toupper(t, c)       nls_toupper(t, c)
 #define ncp_strnicmp(t, s1, s2, len) \
@@ -150,15 +150,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
 int ncp__vol2io(unsigned char *, unsigned int *,
                                const unsigned char *, unsigned int, int);
-#define NCP_IO_TABLE(dentry)    NULL
+#define NCP_IO_TABLE(sb)        NULL
 #define ncp_tolower(t, c)       tolower(c)
 #define ncp_toupper(t, c)       toupper(c)
 #define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
 #define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
-static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
+static inline int ncp_strnicmp(const struct nls_table *t,
-                const unsigned char *s2, int len)
+                const unsigned char *s1, const unsigned char *s2, int len)
 {
        while (len--) {
                if (tolower(*s1++) != tolower(*s2++))
@@ -193,7 +193,7 @@ ncp_renew_dentries(struct dentry *parent)
        struct list_head *next;
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -205,7 +205,7 @@ ncp_renew_dentries(struct dentry *parent)
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 }
 static inline void
@@ -215,7 +215,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
        struct list_head *next;
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -223,7 +223,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
                ncp_age_dentry(server, dentry);
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 }
 struct ncp_cache_head {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 93a8b3bd69e..199016528fc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,9 +16,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
-#if defined(CONFIG_NFS_V4_1)
 #include <linux/sunrpc/bc_xprt.h>
-#endif
 #include <net/inet_sock.h>
@@ -137,6 +135,33 @@ out_err:
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * CB_SEQUENCE operations will fail until the callback sessionid is set.
+ */
+int nfs4_set_callback_sessionid(struct nfs_client *clp)
+{
+        struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
+        struct nfs4_sessionid *bc_sid;
+        if (!serv->sv_bc_xprt)
+                return -EINVAL;
+        /* on success freed in xprt_free */
+        bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
+        if (!bc_sid)
+                return -ENOMEM;
+        memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
+                NFS4_MAX_SESSIONID_LEN);
+        spin_lock_bh(&serv->sv_cb_lock);
+        serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
+        spin_unlock_bh(&serv->sv_cb_lock);
+        dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
+                ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
+                ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
+                serv->sv_bc_xprt);
+        return 0;
+}
+/*
 * The callback service for NFSv4.1 callbacks
 */
 static int
@@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp)
 struct svc_rqst *
 nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 {
-        struct svc_xprt *bc_xprt;
+        struct svc_rqst *rqstp;
-        struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
+        int ret;
-        dprintk("--> %s\n", __func__);
+        /*
-        /* Create a svc_sock for the service */
+         * Create an svc_sock for the back channel service that shares the
-        bc_xprt = svc_sock_create(serv, xprt->prot);
+         * fore channel connection.
-        if (!bc_xprt)
+         * Returns the input port (0) and sets the svc_serv bc_xprt on success
+         */
+        ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+                              SVC_SOCK_ANONYMOUS);
+        if (ret < 0) {
+                rqstp = ERR_PTR(ret);
                goto out;
+        }
        /*
         * Save the svc_serv in the transport so that it can
         * be referenced when the session backchannel is initialized
         */
-        serv->bc_xprt = bc_xprt;
        xprt->bc_serv = serv;
        INIT_LIST_HEAD(&serv->sv_cb_list);
        spin_lock_init(&serv->sv_cb_lock);
        init_waitqueue_head(&serv->sv_cb_waitq);
        rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
-        if (IS_ERR(rqstp))
+        if (IS_ERR(rqstp)) {
-                svc_sock_destroy(bc_xprt);
+                svc_xprt_put(serv->sv_bc_xprt);
+                serv->sv_bc_xprt = NULL;
+        }
 out:
-        dprintk("--> %s return %p\n", __func__, rqstp);
+        dprintk("--> %s return %ld\n", __func__,
+                IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
        return rqstp;
 }
@@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
                struct nfs_callback_data *cb_info)
 {
 }
+int nfs4_set_callback_sessionid(struct nfs_client *clp)
+{
+        return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 /*
@@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp,
        struct rpc_clnt *r = clp->cl_rpcclient;
        char *p = svc_gss_principal(rqstp);
+        /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+        if (clp->cl_minorversion != 0)
+                return SVC_DROP;
        /*
         * It might just be a normal user principal, in which case
         * userspace won't bother to tell us the name at all.
@@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp,
        return SVC_OK;
 }
+/* pg_authenticate method helper */
+static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
+{
+        struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
+        int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
+        dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
+        if (svc_is_backchannel(rqstp))
+                /* Sessionid (usually) set after CB_NULL ping */
+                return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
+                                                  is_cb_compound);
+        else
+                /* No callback identifier in pg_authenticate */
+                return nfs4_find_client_no_ident(svc_addr(rqstp));
+}
+/* pg_authenticate method for nfsv4 callback threads. */
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
        struct nfs_client *clp;
@@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
        int ret = SVC_OK;
        /* Don't talk to strangers */
-        clp = nfs_find_client(svc_addr(rqstp), 4);
+        clp = nfs_cb_find_client(rqstp);
        if (clp == NULL)
                return SVC_DROP;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8d..d3b44f9bd74 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -34,10 +34,17 @@ enum nfs4_callback_opnum {
        OP_CB_ILLEGAL = 10044,
 };
+struct cb_process_state {
+        __be32                  drc_status;
+        struct nfs_client       *clp;
+        struct nfs4_sessionid   *svc_sid; /* v4.1 callback service sessionid */
+};
 struct cb_compound_hdr_arg {
        unsigned int taglen;
        const char *tag;
        unsigned int minorversion;
+        unsigned int cb_ident; /* v4.0 callback identifier */
        unsigned nops;
 };
@@ -103,14 +110,23 @@ struct cb_sequenceres {
        uint32_t                        csr_target_highestslotid;
 };
-extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
-                                       struct cb_sequenceres *res);
+                                       struct cb_sequenceres *res,
+                                       struct cb_process_state *cps);
 extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
                                             const nfs4_stateid *stateid);
 #define RCA4_TYPE_MASK_RDATA_DLG        0
 #define RCA4_TYPE_MASK_WDATA_DLG        1
+#define RCA4_TYPE_MASK_DIR_DLG         2
+#define RCA4_TYPE_MASK_FILE_LAYOUT     3
+#define RCA4_TYPE_MASK_BLK_LAYOUT      4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN  8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
 struct cb_recallanyargs {
        struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
        uint32_t        craa_type_mask;
 };
-extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
+                                        void *dummy,
+                                        struct cb_process_state *cps);
 struct cb_recallslotargs {
        struct sockaddr *crsa_addr;
        uint32_t        crsa_target_max_slots;
 };
-extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
+extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
-                                          void *dummy);
+                                         void *dummy,
+                                         struct cb_process_state *cps);
+struct cb_layoutrecallargs {
+        struct sockaddr         *cbl_addr;
+        uint32_t                cbl_recall_type;
+        uint32_t                cbl_layout_type;
+        uint32_t                cbl_layoutchanged;
+        union {
+                struct {
+                        struct nfs_fh           cbl_fh;
+                        struct pnfs_layout_range cbl_range;
+                        nfs4_stateid            cbl_stateid;
+                };
+                struct nfs_fsid         cbl_fsid;
+        };
+};
-#endif /* CONFIG_NFS_V4_1 */
+extern __be32 nfs4_callback_layoutrecall(
+        struct cb_layoutrecallargs *args,
+        void *dummy, struct cb_process_state *cps);
-extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
+extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
-extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+extern void nfs4_cb_take_slot(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4_1 */
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+                                    struct cb_getattrres *res,
+                                    struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+                                   struct cb_process_state *cps);
 #ifdef CONFIG_NFS_V4
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
                                            const nfs4_stateid *stateid);
+extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4 */
 /*
 * nfs41: Callbacks are expected to not cause substantial latency,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61..4bb91cb2620 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 #ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 #endif
- 
-__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+                             struct cb_getattrres *res,
+                             struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct nfs_delegation *delegation;
        struct nfs_inode *nfsi;
        struct inode *inode;
+        res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+        if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+                goto out;
        res->bitmap[0] = res->bitmap[1] = 0;
        res->status = htonl(NFS4ERR_BADHANDLE);
-        clp = nfs_find_client(args->addr, 4);
-        if (clp == NULL)
-                goto out;
        dprintk("NFS: GETATTR callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-        inode = nfs_delegation_find_inode(clp, &args->fh);
+        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
        if (inode == NULL)
-                goto out_putclient;
+                goto out;
        nfsi = NFS_I(inode);
        rcu_read_lock();
        delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 out_iput:
        rcu_read_unlock();
        iput(inode);
-out_putclient:
-        nfs_put_client(clp);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
        return res->status;
 }
-__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+                            struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct inode *inode;
        __be32 res;
        
-        res = htonl(NFS4ERR_BADHANDLE);
+        res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->addr, 4);
+        if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: RECALL callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-        do {
+        res = htonl(NFS4ERR_BADHANDLE);
-                struct nfs_client *prev = clp;
+        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+        if (inode == NULL)
-                inode = nfs_delegation_find_inode(clp, &args->fh);
+                goto out;
-                if (inode != NULL) {
+        /* Set up a helper thread to actually return the delegation */
-                        /* Set up a helper thread to actually return the delegation */
+        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
-                        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
+        case 0:
-                                case 0:
+                res = 0;
-                                        res = 0;
+                break;
-                                        break;
+        case -ENOENT:
-                                case -ENOENT:
+                if (res != 0)
-                                        if (res != 0)
+                        res = htonl(NFS4ERR_BAD_STATEID);
-                                                res = htonl(NFS4ERR_BAD_STATEID);
+                break;
-                                        break;
+        default:
-                                default:
+                res = htonl(NFS4ERR_RESOURCE);
-                                        res = htonl(NFS4ERR_RESOURCE);
+        }
-                        }
+        iput(inode);
-                        iput(inode);
-                }
-                clp = nfs_find_client_next(prev);
-                nfs_put_client(prev);
-        } while (clp != NULL);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
        return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 #if defined(CONFIG_NFS_V4_1)
+static u32 initiate_file_draining(struct nfs_client *clp,
+                                  struct cb_layoutrecallargs *args)
+{
+        struct pnfs_layout_hdr *lo;
+        struct inode *ino;
+        bool found = false;
+        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+        LIST_HEAD(free_me_list);
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+                if (nfs_compare_fh(&args->cbl_fh,
+                                   &NFS_I(lo->plh_inode)->fh))
+                        continue;
+                ino = igrab(lo->plh_inode);
+                if (!ino)
+                        continue;
+                found = true;
+                /* Without this, layout can be freed as soon
+                 * as we release cl_lock.
+                 */
+                get_layout_hdr(lo);
+                break;
+        }
+        spin_unlock(&clp->cl_lock);
+        if (!found)
+                return NFS4ERR_NOMATCHING_LAYOUT;
+        spin_lock(&ino->i_lock);
+        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+            mark_matching_lsegs_invalid(lo, &free_me_list,
+                                        args->cbl_range.iomode))
+                rv = NFS4ERR_DELAY;
+        else
+                rv = NFS4ERR_NOMATCHING_LAYOUT;
+        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+        spin_unlock(&ino->i_lock);
+        pnfs_free_lseg_list(&free_me_list);
+        put_layout_hdr(lo);
+        iput(ino);
+        return rv;
+}
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+                                  struct cb_layoutrecallargs *args)
+{
+        struct pnfs_layout_hdr *lo;
+        struct inode *ino;
+        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+        struct pnfs_layout_hdr *tmp;
+        LIST_HEAD(recall_list);
+        LIST_HEAD(free_me_list);
+        struct pnfs_layout_range range = {
+                .iomode = IOMODE_ANY,
+                .offset = 0,
+                .length = NFS4_MAX_UINT64,
+        };
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+                if ((args->cbl_recall_type == RETURN_FSID) &&
+                    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+                           &args->cbl_fsid, sizeof(struct nfs_fsid)))
+                        continue;
+                if (!igrab(lo->plh_inode))
+                        continue;
+                get_layout_hdr(lo);
+                BUG_ON(!list_empty(&lo->plh_bulk_recall));
+                list_add(&lo->plh_bulk_recall, &recall_list);
+        }
+        spin_unlock(&clp->cl_lock);
+        list_for_each_entry_safe(lo, tmp,
+                                 &recall_list, plh_bulk_recall) {
+                ino = lo->plh_inode;
+                spin_lock(&ino->i_lock);
+                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+                if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+                        rv = NFS4ERR_DELAY;
+                list_del_init(&lo->plh_bulk_recall);
+                spin_unlock(&ino->i_lock);
+                put_layout_hdr(lo);
+                iput(ino);
+        }
+        pnfs_free_lseg_list(&free_me_list);
+        return rv;
+}
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+                                    struct cb_layoutrecallargs *args)
+{
+        u32 res = NFS4ERR_DELAY;
+        dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+        if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
+                goto out;
+        if (args->cbl_recall_type == RETURN_FILE)
+                res = initiate_file_draining(clp, args);
+        else
+                res = initiate_bulk_draining(clp, args);
+        clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+out:
+        dprintk("%s returning %i\n", __func__, res);
+        return res;
+}
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+                                  void *dummy, struct cb_process_state *cps)
+{
+        u32 res;
+        dprintk("%s: -->\n", __func__);
+        if (cps->clp)
+                res = do_callback_layoutrecall(cps->clp, args);
+        else
+                res = NFS4ERR_OP_NOT_IN_SESSION;
+        dprintk("%s: exit with status = %d\n", __func__, res);
+        return cpu_to_be32(res);
+}
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+        struct cb_layoutrecallargs args;
+        /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+        memset(&args, 0, sizeof(args));
+        args.cbl_recall_type = RETURN_ALL;
+        /* FIXME we ignore errors, what should we do? */
+        do_callback_layoutrecall(clp, &args);
+}
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
        if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 }
 /*
- * Returns a pointer to a held 'struct nfs_client' that matches the server's
- * address, major version number, and session ID.  It is the caller's
- * responsibility to release the returned reference.
- *
- * Returns NULL if there are no connections with sessions, or if no session
- * matches the one of interest.
- */
- static struct nfs_client *find_client_with_session(
-        const struct sockaddr *addr, u32 nfsversion,
-        struct nfs4_sessionid *sessionid)
-{
-        struct nfs_client *clp;
-        clp = nfs_find_client(addr, 4);
-        if (clp == NULL)
-                return NULL;
-        do {
-                struct nfs_client *prev = clp;
-                if (clp->cl_session != NULL) {
-                        if (memcmp(clp->cl_session->sess_id.data,
-                                        sessionid->data,
-                                        NFS4_MAX_SESSIONID_LEN) == 0) {
-                                /* Returns a held reference to clp */
-                                return clp;
-                        }
-                }
-                clp = nfs_find_client_next(prev);
-                nfs_put_client(prev);
-        } while (clp != NULL);
-        return NULL;
-}
-/*
 * For each referring call triple, check the session's slot table for
 * a match.  If the slot is in use and the sequence numbers match, the
 * client is still waiting for a response to the original request.
@@ -276,20 +368,34 @@ out:
 }
 __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
-                                struct cb_sequenceres *res)
+                              struct cb_sequenceres *res,
+                              struct cb_process_state *cps)
 {
        struct nfs_client *clp;
        int i;
        __be32 status;
+        cps->clp = NULL;
        status = htonl(NFS4ERR_BADSESSION);
-        clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
+        /* Incoming session must match the callback session */
+        if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
+                goto out;
+        clp = nfs4_find_client_sessionid(args->csa_addr,
+                                         &args->csa_sessionid, 1);
        if (clp == NULL)
                goto out;
+        /* state manager is resetting the session */
+        if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+                status = htonl(NFS4ERR_DELAY);
+                goto out;
+        }
        status = validate_seqid(&clp->cl_session->bc_slot_table, args);
        if (status)
-                goto out_putclient;
+                goto out;
        /*
         * Check for pending referring calls.  If a match is found, a
@@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
         */
        if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
                status = htonl(NFS4ERR_DELAY);
-                goto out_putclient;
+                goto out;
        }
        memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        res->csr_slotid = args->csa_slotid;
        res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
        res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+        nfs4_cb_take_slot(clp);
+        cps->clp = clp; /* put in nfs4_callback_compound */
-out_putclient:
-        nfs_put_client(clp);
 out:
        for (i = 0; i < args->csa_nrclists; i++)
                kfree(args->csa_rclists[i].rcl_refcalls);
        kfree(args->csa_rclists);
-        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
+        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
-                res->csr_status = 0;
+                cps->drc_status = status;
-        else
+                status = 0;
+        } else
                res->csr_status = status;
        dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
                ntohl(status), ntohl(res->csr_status));
        return status;
 }
-__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+        return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+                               struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        __be32 status;
        fmode_t flags = 0;
-        status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+        status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->craa_addr, 4);
+        if (!cps->clp) /* set in cb_sequence */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: RECALL_ANY callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+        status = cpu_to_be32(NFS4ERR_INVAL);
+        if (!validate_bitmap_values(args->craa_type_mask))
+                goto out;
+        status = cpu_to_be32(NFS4_OK);
        if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
                     &args->craa_type_mask))
                flags = FMODE_READ;
        if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
                     &args->craa_type_mask))
                flags |= FMODE_WRITE;
+        if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+                     &args->craa_type_mask))
+                pnfs_recall_all_layouts(cps->clp);
        if (flags)
-                nfs_expire_all_delegation_types(clp, flags);
+                nfs_expire_all_delegation_types(cps->clp, flags);
-        status = htonl(NFS4_OK);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
 }
 /* Reduce the fore channel's max_slots to the target value */
-__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+                                struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct nfs4_slot_table *fc_tbl;
        __be32 status;
        status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->crsa_addr, 4);
+        if (!cps->clp) /* set in cb_sequence */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
                args->crsa_target_max_slots);
-        fc_tbl = &clp->cl_session->fc_slot_table;
+        fc_tbl = &cps->clp->cl_session->fc_slot_table;
        status = htonl(NFS4ERR_BAD_HIGH_SLOT);
        if (args->crsa_target_max_slots > fc_tbl->max_slots ||
            args->crsa_target_max_slots < 1)
-                goto out_putclient;
+                goto out;
        status = htonl(NFS4_OK);
        if (args->crsa_target_max_slots == fc_tbl->max_slots)
-                goto out_putclient;
+                goto out;
        fc_tbl->target_max_slots = args->crsa_target_max_slots;
-        nfs41_handle_recall_slot(clp);
+        nfs41_handle_recall_slot(cps->clp);
-out_putclient:
-        nfs_put_client(clp);    /* balance nfs_find_client */
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0ed..23112c263f8 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include "nfs4_fs.h"
 #include "callback.h"
+#include "internal.h"
 #define CB_OP_TAGLEN_MAXSZ      (512)
 #define CB_OP_HDR_RES_MAXSZ     (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
 #define CB_OP_RECALL_RES_MAXSZ  (CB_OP_HDR_RES_MAXSZ)
 #if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ        (CB_OP_HDR_RES_MAXSZ + \
                                        4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ       (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
 /* Internal error code */
 #define NFS4ERR_RESOURCE_HDR    11050
-typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *,
+                                        struct cb_process_state *);
 typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
 typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        hdr->minorversion = ntohl(*p++);
        /* Check minor version is zero or one. */
        if (hdr->minorversion <= 1) {
-                p++;    /* skip callback_ident */
+                hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
        } else {
                printk(KERN_WARNING "%s: NFSv4 server callback with "
                        "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
+                                       struct cb_layoutrecallargs *args)
+{
+        __be32 *p;
+        __be32 status = 0;
+        uint32_t iomode;
+        args->cbl_addr = svc_addr(rqstp);
+        p = read_buf(xdr, 4 * sizeof(uint32_t));
+        if (unlikely(p == NULL)) {
+                status = htonl(NFS4ERR_BADXDR);
+                goto out;
+        }
+        args->cbl_layout_type = ntohl(*p++);
+        /* Despite the spec's xdr, iomode really belongs in the FILE switch,
+         * as it is unusable and ignored with the other types.
+         */
+        iomode = ntohl(*p++);
+        args->cbl_layoutchanged = ntohl(*p++);
+        args->cbl_recall_type = ntohl(*p++);
+        if (args->cbl_recall_type == RETURN_FILE) {
+                args->cbl_range.iomode = iomode;
+                status = decode_fh(xdr, &args->cbl_fh);
+                if (unlikely(status != 0))
+                        goto out;
+                p = read_buf(xdr, 2 * sizeof(uint64_t));
+                if (unlikely(p == NULL)) {
+                        status = htonl(NFS4ERR_BADXDR);
+                        goto out;
+                }
+                p = xdr_decode_hyper(p, &args->cbl_range.offset);
+                p = xdr_decode_hyper(p, &args->cbl_range.length);
+                status = decode_stateid(xdr, &args->cbl_stateid);
+                if (unlikely(status != 0))
+                        goto out;
+        } else if (args->cbl_recall_type == RETURN_FSID) {
+                p = read_buf(xdr, 2 * sizeof(uint64_t));
+                if (unlikely(p == NULL)) {
+                        status = htonl(NFS4ERR_BADXDR);
+                        goto out;
+                }
+                p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+                p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+        } else if (args->cbl_recall_type != RETURN_ALL) {
+                status = htonl(NFS4ERR_BADXDR);
+                goto out;
+        }
+        dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+                __func__,
+                args->cbl_layout_type, iomode,
+                args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+        return status;
+}
 static __be32 decode_sessionid(struct xdr_stream *xdr,
                                 struct nfs4_sessionid *sid)
 {
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_SEQUENCE:
        case OP_CB_RECALL_ANY:
        case OP_CB_RECALL_SLOT:
+        case OP_CB_LAYOUTRECALL:
                *op = &callback_ops[op_nr];
                break;
-        case OP_CB_LAYOUTRECALL:
        case OP_CB_NOTIFY_DEVICEID:
        case OP_CB_NOTIFY:
        case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        return htonl(NFS_OK);
 }
+static void nfs4_callback_free_slot(struct nfs4_session *session)
+{
+        struct nfs4_slot_table *tbl = &session->bc_slot_table;
+        spin_lock(&tbl->slot_tbl_lock);
+        /*
+         * Let the state manager know that callback processing is done.
+         * A single slot, so highest used slotid is either 0 or -1
+         */
+        tbl->highest_used_slotid--;
+        nfs4_check_drain_bc_complete(session);
+        spin_unlock(&tbl->slot_tbl_lock);
+}
+static void nfs4_cb_free_slot(struct nfs_client *clp)
+{
+        if (clp && clp->cl_session)
+                nfs4_callback_free_slot(clp->cl_session);
+}
+/* A single slot, so highest used slotid is either 0 or -1 */
+void nfs4_cb_take_slot(struct nfs_client *clp)
+{
+        struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
+        spin_lock(&tbl->slot_tbl_lock);
+        tbl->highest_used_slotid++;
+        BUG_ON(tbl->highest_used_slotid != 0);
+        spin_unlock(&tbl->slot_tbl_lock);
+}
 #else /* CONFIG_NFS_V4_1 */
 static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }
+static void nfs4_cb_free_slot(struct nfs_client *clp)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 static __be32 process_op(uint32_t minorversion, int nop,
                struct svc_rqst *rqstp,
                struct xdr_stream *xdr_in, void *argp,
-                struct xdr_stream *xdr_out, void *resp, int* drc_status)
+                struct xdr_stream *xdr_out, void *resp,
+                struct cb_process_state *cps)
 {
        struct callback_op *op = &callback_ops[0];
        unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
        if (status)
                goto encode_hdr;
-        if (*drc_status) {
+        if (cps->drc_status) {
-                status = *drc_status;
+                status = cps->drc_status;
                goto encode_hdr;
        }
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
        if (maxlen > 0 && maxlen < PAGE_SIZE) {
                status = op->decode_args(rqstp, xdr_in, argp);
                if (likely(status == 0))
-                        status = op->process_op(argp, resp);
+                        status = op->process_op(argp, resp, cps);
        } else
                status = htonl(NFS4ERR_RESOURCE);
-        /* Only set by OP_CB_SEQUENCE processing */
-        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
-                *drc_status = status;
-                status = 0;
-        }
 encode_hdr:
        res = encode_op_hdr(xdr_out, op_nr, status);
        if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        struct cb_compound_hdr_arg hdr_arg = { 0 };
        struct cb_compound_hdr_res hdr_res = { NULL };
        struct xdr_stream xdr_in, xdr_out;
-        __be32 *p;
+        __be32 *p, status;
-        __be32 status, drc_status = 0;
+        struct cb_process_state cps = {
+                .drc_status = 0,
+                .clp = NULL,
+        };
        unsigned int nops = 0;
        dprintk("%s: start\n", __func__);
@@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        if (status == __constant_htonl(NFS4ERR_RESOURCE))
                return rpc_garbage_args;
+        if (hdr_arg.minorversion == 0) {
+                cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
+                if (!cps.clp)
+                        return rpc_drop_reply;
+        } else
+                cps.svc_sid = bc_xprt_sid(rqstp);
        hdr_res.taglen = hdr_arg.taglen;
        hdr_res.tag = hdr_arg.tag;
        if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        while (status == 0 && nops != hdr_arg.nops) {
                status = process_op(hdr_arg.minorversion, nops, rqstp,
-                                    &xdr_in, argp, &xdr_out, resp, &drc_status);
+                                    &xdr_in, argp, &xdr_out, resp, &cps);
                nops++;
        }
@@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        *hdr_res.status = status;
        *hdr_res.nops = htonl(nops);
+        nfs4_cb_free_slot(cps.clp);
+        nfs_put_client(cps.clp);
        dprintk("%s: done, status = %u\n", __func__, ntohl(status));
        return rpc_success;
 }
@@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = {
                .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
        },
 #if defined(CONFIG_NFS_V4_1)
+        [OP_CB_LAYOUTRECALL] = {
+                .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+                .decode_args =
+                        (callback_decode_arg_t)decode_layoutrecall_args,
+                .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+        },
        [OP_CB_SEQUENCE] = {
                .process_op = (callback_process_op_t)nfs4_callback_sequence,
                .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc..192f2f86026 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
 static LIST_HEAD(nfs_client_list);
 static LIST_HEAD(nfs_volume_list);
 static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+#ifdef CONFIG_NFS_V4
+static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to lookup the nfs_client struct
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+        int ret = 0;
+        if (clp->rpc_ops->version != 4 || minorversion != 0)
+                return ret;
+retry:
+        if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
+                return -ENOMEM;
+        spin_lock(&nfs_client_lock);
+        ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
+        spin_unlock(&nfs_client_lock);
+        if (ret == -EAGAIN)
+                goto retry;
+        return ret;
+}
+#endif /* CONFIG_NFS_V4 */
 /*
 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        clp->cl_proto = cl_init->proto;
 #ifdef CONFIG_NFS_V4
-        INIT_LIST_HEAD(&clp->cl_delegations);
+        err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+        if (err)
+                goto error_cleanup;
        spin_lock_init(&clp->cl_lock);
        INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
        rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
 }
 #ifdef CONFIG_NFS_V4
-/*
- * Clears/puts all minor version specific parts from an nfs_client struct
- * reverting it to minorversion 0.
- */
-static void nfs4_clear_client_minor_version(struct nfs_client *clp)
-{
 #ifdef CONFIG_NFS_V4_1
-        if (nfs4_has_session(clp)) {
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+        if (nfs4_has_session(clp))
                nfs4_destroy_session(clp->cl_session);
-                clp->cl_session = NULL;
-        }
-        clp->cl_mvops = nfs_v4_minor_ops[0];
-#endif /* CONFIG_NFS_V4_1 */
 }
+#else /* CONFIG_NFS_V4_1 */
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 {
        if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
                nfs4_kill_renewd(clp);
-        nfs4_clear_client_minor_version(clp);
+        nfs4_shutdown_session(clp);
        nfs4_destroy_callback(clp);
        if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
                nfs_idmap_delete(clp);
        rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
 }
+/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
+void nfs_cleanup_cb_ident_idr(void)
+{
+        idr_destroy(&cb_ident_idr);
+}
+/* nfs_client_lock held */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+        if (clp->cl_cb_ident)
+                idr_remove(&cb_ident_idr, clp->cl_cb_ident);
+}
+static void pnfs_init_server(struct nfs_server *server)
+{
+        rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
 #else
 static void nfs4_shutdown_client(struct nfs_client *clp)
 {
 }
+void nfs_cleanup_cb_ident_idr(void)
+{
+}
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
 #endif /* CONFIG_NFS_V4 */
 /*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
        if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
                list_del(&clp->cl_share_link);
+                nfs_cb_idr_remove_locked(clp);
                spin_unlock(&nfs_client_lock);
                BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
        return 0;
 }
-/*
+/* Common match routine for v4.0 and v4.1 callback services */
- * Find a client by IP address and protocol version
+bool
- * - returns NULL if no such client
+nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
- */
+                     u32 minorversion)
-struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
-{
-        struct nfs_client *clp;
-        spin_lock(&nfs_client_lock);
-        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
-                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
-                /* Don't match clients that failed to initialise properly */
-                if (!(clp->cl_cons_state == NFS_CS_READY ||
-                      clp->cl_cons_state == NFS_CS_SESSION_INITING))
-                        continue;
-                /* Different NFS versions cannot share the same nfs_client */
-                if (clp->rpc_ops->version != nfsversion)
-                        continue;
-                /* Match only the IP address, not the port number */
-                if (!nfs_sockaddr_match_ipaddr(addr, clap))
-                        continue;
-                atomic_inc(&clp->cl_count);
-                spin_unlock(&nfs_client_lock);
-                return clp;
-        }
-        spin_unlock(&nfs_client_lock);
-        return NULL;
-}
-/*
- * Find a client by IP address and protocol version
- * - returns NULL if no such client
- */
-struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 {
-        struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
+        struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
-        u32 nfsvers = clp->rpc_ops->version;
-        spin_lock(&nfs_client_lock);
+        /* Don't match clients that failed to initialise */
-        list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
+        if (!(clp->cl_cons_state == NFS_CS_READY ||
-                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+            clp->cl_cons_state == NFS_CS_SESSION_INITING))
+                return false;
-                /* Don't match clients that failed to initialise properly */
+        /* Match the version and minorversion */
-                if (clp->cl_cons_state != NFS_CS_READY)
+        if (clp->rpc_ops->version != 4 ||
-                        continue;
+            clp->cl_minorversion != minorversion)
+                return false;
-                /* Different NFS versions cannot share the same nfs_client */
+        /* Match only the IP address, not the port number */
-                if (clp->rpc_ops->version != nfsvers)
+        if (!nfs_sockaddr_match_ipaddr(addr, clap))
-                        continue;
+                return false;
-                /* Match only the IP address, not the port number */
+        return true;
-                if (!nfs_sockaddr_match_ipaddr(sap, clap))
-                        continue;
-                atomic_inc(&clp->cl_count);
-                spin_unlock(&nfs_client_lock);
-                return clp;
-        }
-        spin_unlock(&nfs_client_lock);
-        return NULL;
 }
 /*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
        target->options = source->options;
 }
+static void nfs_server_insert_lists(struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        spin_lock(&nfs_client_lock);
+        list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+        list_add_tail(&server->master_link, &nfs_volume_list);
+        spin_unlock(&nfs_client_lock);
+}
+static void nfs_server_remove_lists(struct nfs_server *server)
+{
+        spin_lock(&nfs_client_lock);
+        list_del_rcu(&server->client_link);
+        list_del(&server->master_link);
+        spin_unlock(&nfs_client_lock);
+        synchronize_rcu();
+}
 /*
 * Allocate and initialise a server record
 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
        /* Zero out the NFS state stuff */
        INIT_LIST_HEAD(&server->client_link);
        INIT_LIST_HEAD(&server->master_link);
+        INIT_LIST_HEAD(&server->delegations);
        atomic_set(&server->active, 0);
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
                return NULL;
        }
+        pnfs_init_server(server);
        return server;
 }
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
 {
        dprintk("--> nfs_free_server()\n");
+        nfs_server_remove_lists(server);
        unset_pnfs_layoutdriver(server);
-        spin_lock(&nfs_client_lock);
-        list_del(&server->client_link);
-        list_del(&server->master_link);
-        spin_unlock(&nfs_client_lock);
        if (server->destroy != NULL)
                server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
                (unsigned long long) server->fsid.major,
                (unsigned long long) server->fsid.minor);
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
        nfs_free_fattr(fattr);
        return server;
@@ -1125,6 +1156,101 @@ error:
 #ifdef CONFIG_NFS_V4
 /*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by IP address, protocol version, and minorversion
+ *
+ * Called from the pg_authenticate method. The callback identifier
+ * is not used as it has not been decoded.
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_no_ident(const struct sockaddr *addr)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                if (nfs4_cb_match_client(addr, clp, 0) == false)
+                        continue;
+                atomic_inc(&clp->cl_count);
+                spin_unlock(&nfs_client_lock);
+                return clp;
+        }
+        spin_unlock(&nfs_client_lock);
+        return NULL;
+}
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by callback identifier
+ */
+struct nfs_client *
+nfs4_find_client_ident(int cb_ident)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        clp = idr_find(&cb_ident_idr, cb_ident);
+        if (clp)
+                atomic_inc(&clp->cl_count);
+        spin_unlock(&nfs_client_lock);
+        return clp;
+}
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
+ * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
+ * can arrive before the callback sessionid is set. For CB_NULL calls,
+ * find a client by IP address, protocol version, and minorversion.
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+                           struct nfs4_sessionid *sid, int is_cb_compound)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                if (nfs4_cb_match_client(addr, clp, 1) == false)
+                        continue;
+                if (!nfs4_has_session(clp))
+                        continue;
+                /* Match the sessionid unless this is a CB_NULL call */
+                if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
+                    sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
+                        continue;
+                atomic_inc(&clp->cl_count);
+                spin_unlock(&nfs_client_lock);
+                return clp;
+        }
+        spin_unlock(&nfs_client_lock);
+        return NULL;
+}
+#else /* CONFIG_NFS_V4_1 */
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+                           struct nfs4_sessionid *sid, int is_cb_compound)
+{
+        return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
 * Initialize the NFS4 callback service
 */
 static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
        if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
                server->namelen = NFS4_MAXNAMLEN;
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
 out:
        nfs_free_fattr(fattr);
@@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
        if (error < 0)
                goto out_free_server;
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
        nfs_free_fattr(fattr_fsinfo);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1fd62fc49be..364e4328f39 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
        call_rcu(&delegation->rcu, nfs_free_delegation_callback);
 }
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ */
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 {
        set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
+/**
+ * nfs_have_delegation - check if inode has a delegation
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
 int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
        struct nfs_delegation *delegation;
@@ -119,10 +131,15 @@ again:
        return 0;
 }
-/*
+/**
- * Set up a delegation on an inode
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @res: new delegation state from server
+ *
 */
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+                                  struct nfs_openres *res)
 {
        struct nfs_delegation *delegation;
        struct rpc_cred *oldcred = NULL;
@@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
        return inode;
 }
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+static struct nfs_delegation *
-                                                           const nfs4_stateid *stateid,
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
-                                                           struct nfs_client *clp)
+                             struct nfs_server *server)
 {
        struct nfs_delegation *delegation =
                rcu_dereference_protected(nfsi->delegation,
-                                          lockdep_is_held(&clp->cl_lock));
+                                lockdep_is_held(&server->nfs_client->cl_lock));
        if (delegation == NULL)
                goto nomatch;
        spin_lock(&delegation->lock);
-        if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
-                                sizeof(delegation->stateid.data)) != 0)
-                goto nomatch_unlock;
        list_del_rcu(&delegation->super_list);
        delegation->inode = NULL;
        nfsi->delegation_state = 0;
        rcu_assign_pointer(nfsi->delegation, NULL);
        spin_unlock(&delegation->lock);
        return delegation;
-nomatch_unlock:
-        spin_unlock(&delegation->lock);
 nomatch:
        return NULL;
 }
-/*
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
- * Set up a delegation on an inode
+                                                    struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        struct nfs_delegation *delegation;
+        spin_lock(&clp->cl_lock);
+        delegation = nfs_detach_delegation_locked(nfsi, server);
+        spin_unlock(&clp->cl_lock);
+        return delegation;
+}
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @res: new delegation state from server
+ *
+ * Returns zero on success, or a negative errno value.
 */
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
+        struct nfs_client *clp = server->nfs_client;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation, *old_delegation;
        struct nfs_delegation *freeme = NULL;
@@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        spin_lock(&clp->cl_lock);
        old_delegation = rcu_dereference_protected(nfsi->delegation,
-                                                   lockdep_is_held(&clp->cl_lock));
+                                        lockdep_is_held(&clp->cl_lock));
        if (old_delegation != NULL) {
                if (memcmp(&delegation->stateid, &old_delegation->stateid,
                                        sizeof(old_delegation->stateid)) == 0 &&
@@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
                        delegation = NULL;
                        goto out;
                }
-                freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
+                freeme = nfs_detach_delegation_locked(nfsi, server);
        }
-        list_add_rcu(&delegation->super_list, &clp->cl_delegations);
+        list_add_rcu(&delegation->super_list, &server->delegations);
        nfsi->delegation_state = delegation->type;
        rcu_assign_pointer(nfsi->delegation, delegation);
        delegation = NULL;
@@ -290,73 +321,85 @@ out:
        return err;
 }
-/*
+/**
- * Return all delegations that have been marked for return
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Returns zero on success, or a negative errno value.
 */
 int nfs_client_return_marked_delegations(struct nfs_client *clp)
 {
        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        struct inode *inode;
        int err = 0;
 restart:
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+                list_for_each_entry_rcu(delegation, &server->delegations,
-                        continue;
+                                                                super_list) {
-                inode = nfs_delegation_grab_inode(delegation);
+                        if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
-                if (inode == NULL)
+                                                        &delegation->flags))
-                        continue;
+                                continue;
-                spin_lock(&clp->cl_lock);
+                        inode = nfs_delegation_grab_inode(delegation);
-                delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
+                        if (inode == NULL)
-                spin_unlock(&clp->cl_lock);
+                                continue;
-                rcu_read_unlock();
+                        delegation = nfs_detach_delegation(NFS_I(inode),
-                if (delegation != NULL) {
+                                                                server);
-                        filemap_flush(inode->i_mapping);
+                        rcu_read_unlock();
-                        err = __nfs_inode_return_delegation(inode, delegation, 0);
+                        if (delegation != NULL) {
+                                filemap_flush(inode->i_mapping);
+                                err = __nfs_inode_return_delegation(inode,
+                                                                delegation, 0);
+                        }
+                        iput(inode);
+                        if (!err)
+                                goto restart;
+                        set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+                        return err;
                }
-                iput(inode);
-                if (!err)
-                        goto restart;
-                set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
-                return err;
        }
        rcu_read_unlock();
        return 0;
 }
-/*
+/**
- * This function returns the delegation without reclaiming opens
+ * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
- * or protecting against delegation reclaims.
+ * @inode: inode to process
- * It is therefore really only safe to be called from
+ *
- * nfs4_clear_inode()
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode().
 */
 void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
        if (rcu_access_pointer(nfsi->delegation) != NULL) {
-                spin_lock(&clp->cl_lock);
+                delegation = nfs_detach_delegation(nfsi, server);
-                delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
-                spin_unlock(&clp->cl_lock);
                if (delegation != NULL)
                        nfs_do_return_delegation(inode, delegation, 0);
        }
 }
+/**
+ * nfs_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * Returns zero on success, or a negative errno value.
+ */
 int nfs_inode_return_delegation(struct inode *inode)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
        int err = 0;
        if (rcu_access_pointer(nfsi->delegation) != NULL) {
-                spin_lock(&clp->cl_lock);
+                delegation = nfs_detach_delegation(nfsi, server);
-                delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
-                spin_unlock(&clp->cl_lock);
                if (delegation != NULL) {
                        nfs_wb_all(inode);
                        err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode)
        return err;
 }
-static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
+static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
 {
+        struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
        set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
        set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 }
-/*
+/**
- * Return all delegations associated to a super block
+ * nfs_super_return_all_delegations - return delegations for one superblock
+ * @sb: sb to process
+ *
 */
 void nfs_super_return_all_delegations(struct super_block *sb)
 {
-        struct nfs_client *clp = NFS_SB(sb)->nfs_client;
+        struct nfs_server *server = NFS_SB(sb);
+        struct nfs_client *clp = server->nfs_client;
        struct nfs_delegation *delegation;
        if (clp == NULL)
                return;
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
                spin_lock(&delegation->lock);
-                if (delegation->inode != NULL && delegation->inode->i_sb == sb)
+                set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-                        set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
                spin_unlock(&delegation->lock);
        }
        rcu_read_unlock();
        if (nfs_client_return_marked_delegations(clp) != 0)
                nfs4_schedule_state_manager(clp);
 }
-static
+static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
-void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags)
+                                                 fmode_t flags)
 {
        struct nfs_delegation *delegation;
-        rcu_read_lock();
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
                if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
                        continue;
                if (delegation->type & flags)
-                        nfs_mark_return_delegation(clp, delegation);
+                        nfs_mark_return_delegation(delegation);
        }
+}
+static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
+                                                        fmode_t flags)
+{
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs_mark_return_all_delegation_types(server, flags);
        rcu_read_unlock();
 }
@@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
                nfs4_schedule_state_manager(clp);
 }
+/**
+ * nfs_expire_all_delegation_types - mark delegations of the given types for return
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ */
 void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
 {
        nfs_client_mark_return_all_delegation_types(clp, flags);
        nfs_delegation_run_state_manager(clp);
 }
+/**
+ * nfs_expire_all_delegations - expire both read and write delegations
+ * @clp: client to process
+ *
+ */
 void nfs_expire_all_delegations(struct nfs_client *clp)
 {
        nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
 }
-/*
+/**
- * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
+ * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
 */
 void nfs_handle_cb_pathdown(struct nfs_client *clp)
 {
@@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
        nfs_client_mark_return_all_delegations(clp);
 }
-static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 {
        struct nfs_delegation *delegation;
-        rcu_read_lock();
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
                if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
                        continue;
-                nfs_mark_return_delegation(clp, delegation);
+                nfs_mark_return_delegation(delegation);
        }
-        rcu_read_unlock();
 }
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ */
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 {
-        nfs_client_mark_return_unreferenced_delegations(clp);
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs_mark_return_unreferenced_delegations(server);
+        rcu_read_unlock();
        nfs_delegation_run_state_manager(clp);
 }
-/*
+/**
- * Asynchronous delegation recall!
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information from CB_RECALL arguments
+ *
+ * Returns zero on success, or a negative errno value.
 */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
+int nfs_async_inode_return_delegation(struct inode *inode,
+                                      const nfs4_stateid *stateid)
 {
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_delegation *delegation;
@@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
                rcu_read_unlock();
                return -ENOENT;
        }
+        nfs_mark_return_delegation(delegation);
-        nfs_mark_return_delegation(clp, delegation);
        rcu_read_unlock();
        nfs_delegation_run_state_manager(clp);
        return 0;
 }
-/*
+static struct inode *
- * Retrieve the inode associated with a delegation
+nfs_delegation_find_inode_server(struct nfs_server *server,
- */
+                                 const struct nfs_fh *fhandle)
-struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
 {
        struct nfs_delegation *delegation;
        struct inode *res = NULL;
-        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
                spin_lock(&delegation->lock);
                if (delegation->inode != NULL &&
                    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
                if (res != NULL)
                        break;
        }
+        return res;
+}
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Returns pointer to inode matching "fhandle," or NULL if a matching inode
+ * cannot be found.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+                                        const struct nfs_fh *fhandle)
+{
+        struct nfs_server *server;
+        struct inode *res = NULL;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+                res = nfs_delegation_find_inode_server(server, fhandle);
+                if (res != NULL)
+                        break;
+        }
        rcu_read_unlock();
        return res;
 }
-/*
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
- * Mark all delegations as needing to be reclaimed
+{
+        struct nfs_delegation *delegation;
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+                set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
 */
 void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 {
-        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
-                set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+                nfs_delegation_mark_reclaim_server(server);
        rcu_read_unlock();
 }
-/*
+/**
- * Reap all unclaimed delegations after reboot recovery is done
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
 */
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        struct inode *inode;
 restart:
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
+                list_for_each_entry_rcu(delegation, &server->delegations,
-                        continue;
+                                                                super_list) {
-                inode = nfs_delegation_grab_inode(delegation);
+                        if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
-                if (inode == NULL)
+                                                &delegation->flags) == 0)
-                        continue;
+                                continue;
-                spin_lock(&clp->cl_lock);
+                        inode = nfs_delegation_grab_inode(delegation);
-                delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
+                        if (inode == NULL)
-                spin_unlock(&clp->cl_lock);
+                                continue;
-                rcu_read_unlock();
+                        delegation = nfs_detach_delegation(NFS_I(inode),
-                if (delegation != NULL)
+                                                                server);
-                        nfs_free_delegation(delegation);
+                        rcu_read_unlock();
-                iput(inode);
-                goto restart;
+                        if (delegation != NULL)
+                                nfs_free_delegation(delegation);
+                        iput(inode);
+                        goto restart;
+                }
        }
        rcu_read_unlock();
 }
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+        struct nfs_server *server;
+        int ret = 0;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                if (!list_empty(&server->delegations)) {
+                        ret = 1;
+                        break;
+                }
+        rcu_read_unlock();
+        return ret;
+}
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ *
+ * Returns one and fills in "dst->data" if inode had a delegation,
+ * otherwise zero is returned.
+ */
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda1..d9322e490c5 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 void nfs_handle_cb_pathdown(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 996dd8989a9..abe4f0c8dc5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,8 +33,8 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
-#include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
+#include <linux/xattr.h>
 #include "delegation.h"
 #include "iostat.h"
@@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
-        .getxattr       = nfs4_getxattr,
+        .getxattr       = generic_getxattr,
-        .setxattr       = nfs4_setxattr,
+        .setxattr       = generic_setxattr,
-        .listxattr      = nfs4_listxattr,
+        .listxattr      = generic_listxattr,
+        .removexattr    = generic_removexattr,
 };
 #endif /* CONFIG_NFS_V4 */
@@ -172,7 +173,7 @@ struct nfs_cache_array {
        struct nfs_cache_array_entry array[0];
 };
-typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
 typedef struct {
        struct file     *file;
        struct page     *page;
@@ -378,14 +379,14 @@ error:
        return error;
 }
-/* Fill in an entry based on the xdr code stored in desc->page */
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
-static
+                      struct nfs_entry *entry, struct xdr_stream *xdr)
-int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-        __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
+        int error;
-        if (IS_ERR(p))
-                return PTR_ERR(p);
+        error = desc->decode(xdr, entry, desc->plus);
+        if (error)
+                return error;
        entry->fattr->time_start = desc->timestamp;
        entry->fattr->gencount = desc->gencount;
        return 0;
@@ -438,7 +439,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
        if (dentry == NULL)
                return;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+        d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
        if (IS_ERR(inode))
                goto out;
@@ -459,25 +460,26 @@ out:
 /* Perform conversion from xdr to cache array */
 static
 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-                                void *xdr_page, struct page *page, unsigned int buflen)
+                                struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
        struct xdr_stream stream;
-        struct xdr_buf buf;
+        struct xdr_buf buf = {
-        __be32 *ptr = xdr_page;
+                .pages = xdr_pages,
+                .page_len = buflen,
+                .buflen = buflen,
+                .len = buflen,
+        };
+        struct page *scratch;
        struct nfs_cache_array *array;
        unsigned int count = 0;
        int status;
-        buf.head->iov_base = xdr_page;
+        scratch = alloc_page(GFP_KERNEL);
-        buf.head->iov_len = buflen;
+        if (scratch == NULL)
-        buf.tail->iov_len = 0;
+                return -ENOMEM;
-        buf.page_base = 0;
-        buf.page_len = 0;
-        buf.buflen = buf.head->iov_len;
-        buf.len = buf.head->iov_len;
-        xdr_init_decode(&stream, &buf, ptr);
+        xdr_init_decode(&stream, &buf, NULL);
+        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
        do {
                status = xdr_decode(desc, entry, &stream);
@@ -506,6 +508,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
                } else
                        status = PTR_ERR(array);
        }
+        put_page(scratch);
        return status;
 }
@@ -521,7 +525,6 @@ static
 void nfs_readdir_free_large_page(void *ptr, struct page **pages,
                unsigned int npages)
 {
-        vm_unmap_ram(ptr, npages);
        nfs_readdir_free_pagearray(pages, npages);
 }
@@ -530,9 +533,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
 * to nfs_readdir_free_large_page
 */
 static
-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-        void *ptr;
        unsigned int i;
        for (i = 0; i < npages; i++) {
@@ -541,13 +543,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
                        goto out_freepages;
                pages[i] = page;
        }
+        return 0;
-        ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-        if (!IS_ERR_OR_NULL(ptr))
-                return ptr;
 out_freepages:
        nfs_readdir_free_pagearray(pages, i);
-        return NULL;
+        return -ENOMEM;
 }
 static
@@ -566,6 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        entry.eof = 0;
        entry.fh = nfs_alloc_fhandle();
        entry.fattr = nfs_alloc_fattr();
+        entry.server = NFS_SERVER(inode);
        if (entry.fh == NULL || entry.fattr == NULL)
                goto out;
@@ -577,8 +578,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        memset(array, 0, sizeof(struct nfs_cache_array));
        array->eof_index = -1;
-        pages_ptr = nfs_readdir_large_page(pages, array_size);
+        status = nfs_readdir_large_page(pages, array_size);
-        if (!pages_ptr)
+        if (status < 0)
                goto out_release_array;
        do {
                unsigned int pglen;
@@ -587,7 +588,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
                if (status < 0)
                        break;
                pglen = status;
-                status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
+                status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
                if (status < 0) {
                        if (status == -ENOSPC)
                                status = 0;
@@ -938,7 +939,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 * component of the path.
 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
 */
-static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask)
+static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
+                                                unsigned int mask)
 {
        if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
                return 0;
@@ -1018,7 +1020,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
 * If the parent directory is seen to have changed, we throw out the
 * cached dentry and do a new lookup.
 */
-static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
+static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir;
        struct inode *inode;
@@ -1027,6 +1029,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        struct nfs_fattr *fattr = NULL;
        int error;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -1117,7 +1122,7 @@ out_error:
 /*
 * This is called from dput() when d_count is going to 0.
 */
-static int nfs_dentry_delete(struct dentry *dentry)
+static int nfs_dentry_delete(const struct dentry *dentry)
 {
        dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -1188,7 +1193,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
        if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
                goto out;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+        d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
        /*
         * If we're doing an exclusive create, optimize away the lookup
@@ -1217,7 +1222,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
                goto out_unblock_sillyrename;
        }
        inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
-        res = (struct dentry *)inode;
+        res = ERR_CAST(inode);
        if (IS_ERR(res))
                goto out_unblock_sillyrename;
@@ -1333,7 +1338,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                res = ERR_PTR(-ENAMETOOLONG);
                goto out;
        }
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+        d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
        /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
         * the dentry. */
@@ -1351,8 +1356,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
        if (nd->flags & LOOKUP_CREATE) {
                attr.ia_mode = nd->intent.open.create_mode;
                attr.ia_valid = ATTR_MODE;
-                if (!IS_POSIXACL(dir))
+                attr.ia_mode &= ~current_umask();
-                        attr.ia_mode &= ~current_umask();
        } else {
                open_flags &= ~(O_EXCL | O_CREAT);
                attr.ia_valid = 0;
@@ -1718,11 +1722,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
                dir->i_ino, dentry->d_name.name);
-        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count) > 1) {
+        if (dentry->d_count > 1) {
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
                /* Start asynchronous writeout of the inode */
                write_inode_now(dentry->d_inode, 0);
                error = nfs_sillyrename(dir, dentry);
@@ -1733,7 +1735,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
                need_rehash = 1;
        }
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        error = nfs_safe_remove(dentry);
        if (!error || error == -ENOENT) {
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -1868,7 +1869,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
                 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-                 atomic_read(&new_dentry->d_count));
+                 new_dentry->d_count);
        /*
         * For non-directories, check whether the target is busy and if so,
@@ -1886,7 +1887,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        rehash = new_dentry;
                }
-                if (atomic_read(&new_dentry->d_count) > 2) {
+                if (new_dentry->d_count > 2) {
                        int err;
                        /* copy the target dentry's name */
@@ -2188,11 +2189,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
        return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
-int nfs_permission(struct inode *inode, int mask)
+int nfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct rpc_cred *cred;
        int res = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        nfs_inc_stats(inode, NFSIOS_VFSACCESS);
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2240,7 +2244,7 @@ out:
 out_notsup:
        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (res == 0)
-                res = generic_permission(inode, mask, NULL);
+                res = generic_permission(inode, mask, flags, NULL);
        goto out;
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index ac7b814ce16..5596c6a2881 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
                 * This again causes shrink_dcache_for_umount_subtree() to
                 * Oops, since the test for IS_ROOT() will fail.
                 */
-                spin_lock(&dcache_lock);
+                spin_lock(&sb->s_root->d_inode->i_lock);
+                spin_lock(&sb->s_root->d_lock);
                list_del_init(&sb->s_root->d_alias);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&sb->s_root->d_lock);
+                spin_unlock(&sb->s_root->d_inode->i_lock);
        }
        return 0;
 }
@@ -119,7 +121,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        security_d_instantiate(ret, inode);
        if (ret->d_op == NULL)
-                ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+                d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
 out:
        nfs_free_fattr(fsinfo.fattr);
        return ret;
@@ -226,7 +228,7 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        security_d_instantiate(ret, inode);
        if (ret->d_op == NULL)
-                ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+                d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
 out:
        nfs_free_fattr(fattr);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b138..18696882f1c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
        return nfs_idmap_lookup_name(gid, "group", buf, buflen);
 }
-#else  /* CONFIG_NFS_USE_IDMAPPER not defined */
+#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 #include <linux/module.h>
 #include <linux/mutex.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e67e31c7341..ce00b704452 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 */
 void nfs4_evict_inode(struct inode *inode)
 {
+        pnfs_destroy_layout(NFS_I(inode));
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
-        pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
@@ -1438,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
        return &nfsi->vfs_inode;
 }
-void nfs_destroy_inode(struct inode *inode)
+static void nfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
+void nfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, nfs_i_callback);
+}
 static inline void nfs4_init_once(struct nfs_inode *nfsi)
 {
 #ifdef CONFIG_NFS_V4
@@ -1612,6 +1619,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
+        nfs_cleanup_cb_ident_idr();
        unregister_nfs_fs();
        nfs_fs_proc_exit();
        nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e6356b750b7..bfa3a34af80 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info);
 /* client.c */
 extern struct rpc_program nfs_program;
+extern void nfs_cleanup_cb_ident_idr(void);
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
+extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
-extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_ident(int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
+                           int);
 extern struct nfs_server *nfs_create_server(
                                        const struct nfs_parsed_mount_data *,
                                        struct nfs_fh *);
@@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
-extern int nfs_stat_to_errno(int);
+extern int nfs_stat_to_errno(enum nfs_stat);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs2_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs3_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs4_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 4f981f1f668..d4c2d6b7507 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
                .authflavor     = RPC_AUTH_UNIX,
                .flags          = RPC_CLNT_CREATE_NOPING,
        };
-        struct mountres result;
        struct rpc_message msg  = {
                .rpc_argp       = info->dirpath,
-                .rpc_resp       = &result,
        };
        struct rpc_clnt *clnt;
        int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
                args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
        clnt = rpc_create(&args);
-        if (unlikely(IS_ERR(clnt)))
+        if (IS_ERR(clnt))
                goto out_clnt_err;
        dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
 * XDR encode/decode functions for MOUNT
 */
-static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
 {
        const u32 pathname_len = strlen(pathname);
        __be32 *p;
-        if (unlikely(pathname_len > MNTPATHLEN))
+        BUG_ON(pathname_len > MNTPATHLEN);
-                return -EIO;
+        p = xdr_reserve_space(xdr, 4 + pathname_len);
-        p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque(p, pathname, pathname_len);
-        return 0;
 }
-static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
-                           const char *dirpath)
+                                const char *dirpath)
 {
-        struct xdr_stream xdr;
+        encode_mntdirpath(xdr, dirpath);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        return encode_mntdirpath(&xdr, dirpath);
 }
 /*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
        u32 status;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(status));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        status = ntohl(*p);
+        status = be32_to_cpup(p);
        for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
                if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
        return 0;
 }
-static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
-                            struct mountres *res)
+                                struct xdr_stream *xdr,
+                                struct mountres *res)
 {
-        struct xdr_stream xdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_status(xdr, res);
-        status = decode_status(&xdr, res);
        if (unlikely(status != 0 || res->errno != 0))
                return status;
-        return decode_fhandle(&xdr, res);
+        return decode_fhandle(xdr, res);
 }
 static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
        u32 status;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(status));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        status = ntohl(*p);
+        status = be32_to_cpup(p);
        for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
                if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
        u32 size;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(size));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        size = ntohl(*p++);
+        size = be32_to_cpup(p);
        if (size > NFS3_FHSIZE || size == 0)
                return -EIO;
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        if (*count == 0)
                return 0;
-        p = xdr_inline_decode(xdr, sizeof(entries));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        entries = ntohl(*p);
+        entries = be32_to_cpup(p);
        dprintk("NFS: received %u auth flavors\n", entries);
        if (entries > NFS_MAX_SECFLAVORS)
                entries = NFS_MAX_SECFLAVORS;
-        p = xdr_inline_decode(xdr, sizeof(u32) * entries);
+        p = xdr_inline_decode(xdr, 4 * entries);
        if (unlikely(p == NULL))
                return -EIO;
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
                entries = *count;
        for (i = 0; i < entries; i++) {
-                flavors[i] = ntohl(*p++);
+                flavors[i] = be32_to_cpup(p++);
                dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
        }
        *count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        return 0;
 }
-static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
-                             struct mountres *res)
+                                 struct xdr_stream *xdr,
+                                 struct mountres *res)
 {
-        struct xdr_stream xdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_fhs_status(xdr, res);
-        status = decode_fhs_status(&xdr, res);
        if (unlikely(status != 0 || res->errno != 0))
                return status;
-        status = decode_fhandle3(&xdr, res);
+        status = decode_fhandle3(xdr, res);
        if (unlikely(status != 0)) {
                res->errno = -EBADHANDLE;
                return 0;
        }
-        return decode_auth_flavors(&xdr, res);
+        return decode_auth_flavors(xdr, res);
 }
 static struct rpc_procinfo mnt_procedures[] = {
        [MOUNTPROC_MNT] = {
                .p_proc         = MOUNTPROC_MNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
-                .p_decode       = (kxdrproc_t)mnt_dec_mountres,
+                .p_decode       = (kxdrdproc_t)mnt_xdr_dec_mountres,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_replen       = MNT_dec_mountres_sz,
                .p_statidx      = MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
        },
        [MOUNTPROC_UMNT] = {
                .p_proc         = MOUNTPROC_UMNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_statidx      = MOUNTPROC_UMNT,
                .p_name         = "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
 static struct rpc_procinfo mnt3_procedures[] = {
        [MOUNTPROC3_MNT] = {
                .p_proc         = MOUNTPROC3_MNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
-                .p_decode       = (kxdrproc_t)mnt_dec_mountres3,
+                .p_decode       = (kxdrdproc_t)mnt_xdr_dec_mountres3,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_replen       = MNT_dec_mountres3_sz,
                .p_statidx      = MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
        },
        [MOUNTPROC3_UMNT] = {
                .p_proc         = MOUNTPROC3_UMNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_statidx      = MOUNTPROC3_UMNT,
                .p_name         = "UMOUNT",
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf..74aaf3963c1 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
               const struct dentry *dentry,
               char *buffer, ssize_t buflen)
 {
-        char *end = buffer+buflen;
+        char *end;
        int namelen;
+        unsigned seq;
+rename_retry:
+        end = buffer+buflen;
        *--end = '\0';
        buflen--;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+        rcu_read_lock();
        while (!IS_ROOT(dentry) && dentry != droot) {
                namelen = dentry->d_name.len;
                buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
                *--end = '/';
                dentry = dentry->d_parent;
        }
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
        if (*end != '/') {
                if (--buflen < 0)
                        goto Elong;
@@ -82,7 +89,9 @@ char *nfs_path(const char *base,
        memcpy(end, base, namelen);
        return end;
 Elong_unlock:
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5914a1911c9..792cb13a430 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,584 +61,1008 @@
 #define NFS_readdirres_sz       (1)
 #define NFS_statfsres_sz        (1+NFS_info_sz)
 /*
- * Common NFS XDR functions as inlines
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
 */
-static inline __be32 *
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
+                                 unsigned int base, unsigned int len,
+                                 unsigned int bufsize)
 {
-        memcpy(p, fhandle->data, NFS2_FHSIZE);
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
-        return p + XDR_QUADLEN(NFS2_FHSIZE);
+        unsigned int replen;
+        replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
 }
-static inline __be32 *
+/*
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-        /* NFSv2 handles have a fixed length */
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-        fhandle->size = NFS2_FHSIZE;
+                "Remaining buffer length is %tu words.\n",
-        memcpy(fhandle->data, p, NFS2_FHSIZE);
+                func, xdr->end - xdr->p);
-        return p + XDR_QUADLEN(NFS2_FHSIZE);
+}
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/*
+ *      typedef opaque  nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+{
+        u32 recvd, count;
+        size_t hdrlen;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, count);
+        result->eof = 0;        /* NFSv2 does not pass EOF flag on the wire. */
+        result->count = count;
+        return count;
+out_cheating:
+        dprintk("NFS: server cheating in read result: "
+                "count %u > recvd %u\n", count, recvd);
+        count = recvd;
+        goto out;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      enum stat {
+ *              NFS_OK = 0,
+ *              NFSERR_PERM = 1,
+ *              NFSERR_NOENT = 2,
+ *              NFSERR_IO = 5,
+ *              NFSERR_NXIO = 6,
+ *              NFSERR_ACCES = 13,
+ *              NFSERR_EXIST = 17,
+ *              NFSERR_NODEV = 19,
+ *              NFSERR_NOTDIR = 20,
+ *              NFSERR_ISDIR = 21,
+ *              NFSERR_FBIG = 27,
+ *              NFSERR_NOSPC = 28,
+ *              NFSERR_ROFS = 30,
+ *              NFSERR_NAMETOOLONG = 63,
+ *              NFSERR_NOTEMPTY = 66,
+ *              NFSERR_DQUOT = 69,
+ *              NFSERR_STALE = 70,
+ *              NFSERR_WFLUSH = 99
+ *      };
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        *status = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-static inline __be32*
+/*
-xdr_encode_time(__be32 *p, struct timespec *timep)
+ * 2.3.2.  ftype
+ *
+ *      enum ftype {
+ *              NFNON = 0,
+ *              NFREG = 1,
+ *              NFDIR = 2,
+ *              NFBLK = 3,
+ *              NFCHR = 4,
+ *              NFLNK = 5
+ *      };
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
 {
-        *p++ = htonl(timep->tv_sec);
+        *type = be32_to_cpup(p++);
-        /* Convert nanoseconds into microseconds */
+        if (unlikely(*type > NF2FIFO))
-        *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0);
+                *type = NFBAD;
        return p;
 }
-static inline __be32*
+/*
-xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
+ * 2.3.3.  fhandle
+ *
+ *      typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
 {
-        /*
+        __be32 *p;
-         * Passing the invalid value useconds=1000000 is a
-         * Sun convention for "set to current server time".
+        BUG_ON(fh->size != NFS2_FHSIZE);
-         * It's needed to make permissions checks for the
+        p = xdr_reserve_space(xdr, NFS2_FHSIZE);
-         * "touch" program across v2 mounts to Solaris and
+        memcpy(p, fh->data, NFS2_FHSIZE);
-         * Irix boxes work correctly. See description of
+}
-         * sattr in section 6.1 of "NFS Illustrated" by
-         * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
-         */
+{
-        *p++ = htonl(timep->tv_sec);
+        __be32 *p;
-        *p++ = htonl(1000000);
+        p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fh->size = NFS2_FHSIZE;
+        memcpy(fh->data, p, NFS2_FHSIZE);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * 2.3.4.  timeval
+ *
+ *      struct timeval {
+ *              unsigned int seconds;
+ *              unsigned int useconds;
+ *      };
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+        *p++ = cpu_to_be32(timep->tv_sec);
+        if (timep->tv_nsec != 0)
+                *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+        else
+                *p++ = cpu_to_be32(0);
        return p;
 }
-static inline __be32*
+/*
-xdr_decode_time(__be32 *p, struct timespec *timep)
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time".  It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly.  See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+                                              const struct timespec *timep)
 {
-        timep->tv_sec = ntohl(*p++);
+        *p++ = cpu_to_be32(timep->tv_sec);
-        /* Convert microseconds into nanoseconds */
+        *p++ = cpu_to_be32(1000000);
-        timep->tv_nsec = ntohl(*p++) * 1000;
        return p;
 }
-static __be32 *
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+{
+        timep->tv_sec = be32_to_cpup(p++);
+        timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+        return p;
+}
+/*
+ * 2.3.5.  fattr
+ *
+ *      struct fattr {
+ *              ftype           type;
+ *              unsigned int    mode;
+ *              unsigned int    nlink;
+ *              unsigned int    uid;
+ *              unsigned int    gid;
+ *              unsigned int    size;
+ *              unsigned int    blocksize;
+ *              unsigned int    rdev;
+ *              unsigned int    blocks;
+ *              unsigned int    fsid;
+ *              unsigned int    fileid;
+ *              timeval         atime;
+ *              timeval         mtime;
+ *              timeval         ctime;
+ *      };
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
        u32 rdev, type;
-        type = ntohl(*p++);
+        __be32 *p;
-        fattr->mode = ntohl(*p++);
-        fattr->nlink = ntohl(*p++);
+        p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
-        fattr->uid = ntohl(*p++);
+        if (unlikely(p == NULL))
-        fattr->gid = ntohl(*p++);
+                goto out_overflow;
-        fattr->size = ntohl(*p++);
-        fattr->du.nfs2.blocksize = ntohl(*p++);
-        rdev = ntohl(*p++);
-        fattr->du.nfs2.blocks = ntohl(*p++);
-        fattr->fsid.major = ntohl(*p++);
-        fattr->fsid.minor = 0;
-        fattr->fileid = ntohl(*p++);
-        p = xdr_decode_time(p, &fattr->atime);
-        p = xdr_decode_time(p, &fattr->mtime);
-        p = xdr_decode_time(p, &fattr->ctime);
        fattr->valid |= NFS_ATTR_FATTR_V2;
+        p = xdr_decode_ftype(p, &type);
+        fattr->mode = be32_to_cpup(p++);
+        fattr->nlink = be32_to_cpup(p++);
+        fattr->uid = be32_to_cpup(p++);
+        fattr->gid = be32_to_cpup(p++);
+        fattr->size = be32_to_cpup(p++);
+        fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+        rdev = be32_to_cpup(p++);
        fattr->rdev = new_decode_dev(rdev);
-        if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
+        if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
                fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
                fattr->rdev = 0;
        }
+        fattr->du.nfs2.blocks = be32_to_cpup(p++);
+        fattr->fsid.major = be32_to_cpup(p++);
+        fattr->fsid.minor = 0;
+        fattr->fileid = be32_to_cpup(p++);
+        p = xdr_decode_time(p, &fattr->atime);
+        p = xdr_decode_time(p, &fattr->mtime);
+        xdr_decode_time(p, &fattr->ctime);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * 2.3.6.  sattr
+ *
+ *      struct sattr {
+ *              unsigned int    mode;
+ *              unsigned int    uid;
+ *              unsigned int    gid;
+ *              unsigned int    size;
+ *              timeval         atime;
+ *              timeval         mtime;
+ *      };
+ */
+#define NFS2_SATTR_NOT_SET      (0xffffffff)
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+        *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
        return p;
 }
-static inline __be32 *
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
-        const __be32 not_set = __constant_htonl(0xFFFFFFFF);
+        __be32 *p;
-        *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
+        p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
-        *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
-        *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
-        *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
-        if (attr->ia_valid & ATTR_ATIME_SET) {
+        if (attr->ia_valid & ATTR_MODE)
+                *p++ = cpu_to_be32(attr->ia_mode);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_UID)
+                *p++ = cpu_to_be32(attr->ia_uid);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_GID)
+                *p++ = cpu_to_be32(attr->ia_gid);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_SIZE)
+                *p++ = cpu_to_be32((u32)attr->ia_size);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_ATIME_SET)
                p = xdr_encode_time(p, &attr->ia_atime);
-        } else if (attr->ia_valid & ATTR_ATIME) {
+        else if (attr->ia_valid & ATTR_ATIME)
                p = xdr_encode_current_server_time(p, &attr->ia_atime);
-        } else {
+        else
-                *p++ = not_set;
+                p = xdr_time_not_set(p);
-                *p++ = not_set;
+        if (attr->ia_valid & ATTR_MTIME_SET)
-        }
+                xdr_encode_time(p, &attr->ia_mtime);
+        else if (attr->ia_valid & ATTR_MTIME)
-        if (attr->ia_valid & ATTR_MTIME_SET) {
+                xdr_encode_current_server_time(p, &attr->ia_mtime);
-                p = xdr_encode_time(p, &attr->ia_mtime);
+        else
-        } else if (attr->ia_valid & ATTR_MTIME) {
+                xdr_time_not_set(p);
-                p = xdr_encode_current_server_time(p, &attr->ia_mtime);
-        } else {
-                *p++ = not_set; 
-                *p++ = not_set;
-        }
-        return p;
 }
 /*
- * NFS encode functions
+ * 2.3.7.  filename
+ *
+ *      typedef string filename<MAXNAMLEN>;
 */
+static void encode_filename(struct xdr_stream *xdr,
+                            const char *name, u32 length)
+{
+        __be32 *p;
+        BUG_ON(length > NFS2_MAXNAMLEN);
+        p = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(p, name, length);
+}
+static int decode_filename_inline(struct xdr_stream *xdr,
+                                  const char **name, u32 *length)
+{
+        __be32 *p;
+        u32 count;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        if (count > NFS3_MAXNAMLEN)
+                goto out_nametoolong;
+        p = xdr_inline_decode(xdr, count);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        *name = (const char *)p;
+        *length = count;
+        return 0;
+out_nametoolong:
+        dprintk("NFS: returned filename too long: %u\n", count);
+        return -ENAMETOOLONG;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 /*
- * Encode file handle argument
+ * 2.3.8.  path
- * GETATTR, READLINK, STATFS
+ *
+ *      typedef string path<MAXPATHLEN>;
 */
-static int
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
-nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
 {
-        p = xdr_encode_fhandle(p, fh);
+        __be32 *p;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        BUG_ON(length > NFS2_MAXPATHLEN);
+        p = xdr_reserve_space(xdr, 4);
+        *p = cpu_to_be32(length);
+        xdr_write_pages(xdr, pages, 0, length);
+}
+static int decode_path(struct xdr_stream *xdr)
+{
+        u32 length, recvd;
+        size_t hdrlen;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
+                goto out_size;
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(length > recvd))
+                goto out_cheating;
+        xdr_read_pages(xdr, length);
+        xdr_terminate_string(xdr->buf, length);
        return 0;
+out_size:
+        dprintk("NFS: returned pathname too long: %u\n", length);
+        return -ENAMETOOLONG;
+out_cheating:
+        dprintk("NFS: server cheating in pathname result: "
+                "length %u > received %u\n", length, recvd);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
 /*
- * Encode SETATTR arguments
+ * 2.3.9.  attrstat
+ *
+ *      union attrstat switch (stat status) {
+ *      case NFS_OK:
+ *              fattr attributes;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
-nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        enum nfs_stat status;
-        p = xdr_encode_sattr(p, args->sattr);
+        int error;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_fattr(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Encode directory ops argument
+ * 2.3.10.  diropargs
- * LOOKUP, RMDIR
+ *
+ *      struct diropargs {
+ *              fhandle  dir;
+ *              filename name;
+ *      };
 */
-static int
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
-nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
+                             const char *name, u32 length)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_fhandle(xdr, fh);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_filename(xdr, name, length);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode REMOVE argument
+ * 2.3.11.  diropres
+ *
+ *      union diropres switch (stat status) {
+ *      case NFS_OK:
+ *              struct {
+ *                      fhandle file;
+ *                      fattr   attributes;
+ *              } diropok;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
-nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        int error;
-        p = xdr_encode_array(p, args->name.name, args->name.len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        error = decode_fhandle(xdr, result->fh);
-        return 0;
+        if (unlikely(error))
+                goto out;
+        error = decode_fattr(xdr, result->fattr);
+out:
+        return error;
+}
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_diropok(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Arguments to a READ call. Since we read data directly into the page
+ * NFSv2 XDR encode functions
- * cache, we also set up the reply iovec here so that iov[1] points
+ *
- * exactly to the page we want to fetch.
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
 */
-static int
-nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nfs_fh *fh)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_fhandle(xdr, fh);
-        unsigned int replen;
+}
-        u32 offset = (u32)args->offset;
+/*
+ * 2.2.3.  sattrargs
+ *
+ *      struct sattrargs {
+ *              fhandle file;
+ *              sattr attributes;
+ *      };
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_sattrargs *args)
+{
+        encode_fhandle(xdr, args->fh);
+        encode_sattr(xdr, args->sattr);
+}
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_diropargs *args)
+{
+        encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs_readlinkargs *args)
+{
+        encode_fhandle(xdr, args->fh);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->pglen, NFS_readlinkres_sz);
+}
+/*
+ * 2.2.7.  readargs
+ *
+ *      struct readargs {
+ *              fhandle file;
+ *              unsigned offset;
+ *              unsigned count;
+ *              unsigned totalcount;
+ *      };
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+                            const struct nfs_readargs *args)
+{
+        u32 offset = args->offset;
        u32 count = args->count;
+        __be32 *p;
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_fhandle(xdr, args->fh);
-        *p++ = htonl(offset);
-        *p++ = htonl(count);
-        *p++ = htonl(count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* Inline the page array */
+        p = xdr_reserve_space(xdr, 4 + 4 + 4);
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
+        *p++ = cpu_to_be32(offset);
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
+        *p++ = cpu_to_be32(count);
-                         args->pages, args->pgbase, count);
+        *p = cpu_to_be32(count);
+}
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nfs_readargs *args)
+{
+        encode_readargs(xdr, args);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->count, NFS_readres_sz);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
-        return 0;
 }
 /*
- * Decode READ reply
+ * 2.2.9.  writeargs
+ *
+ *      struct writeargs {
+ *              fhandle file;
+ *              unsigned beginoffset;
+ *              unsigned offset;
+ *              unsigned totalcount;
+ *              nfsdata data;
+ *      };
 */
-static int
+static void encode_writeargs(struct xdr_stream *xdr,
-nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+                             const struct nfs_writeargs *args)
 {
-        struct kvec *iov = req->rq_rcv_buf.head;
+        u32 offset = args->offset;
-        size_t hdrlen;
+        u32 count = args->count;
-        u32 count, recvd;
+        __be32 *p;
-        int status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        p = xdr_decode_fattr(p, res->fattr);
-        count = ntohl(*p++);
-        res->eof = 0;
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READ reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READ header is short. iovec will be shifted.\n");
-                xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
-        }
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        encode_fhandle(xdr, args->fh);
-        if (count > recvd) {
-                dprintk("NFS: server cheating in read reply: "
-                        "count %u > recvd %u\n", count, recvd);
-                count = recvd;
-        }
-        dprintk("RPC:      readres OK count %u\n", count);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
-        if (count < res->count)
+        *p++ = cpu_to_be32(offset);
-                res->count = count;
+        *p++ = cpu_to_be32(offset);
+        *p++ = cpu_to_be32(count);
-        return count;
+        /* nfsdata */
+        *p = cpu_to_be32(count);
+        xdr_write_pages(xdr, args->pages, args->pgbase, count);
 }
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_writeargs *args)
+{
+        encode_writeargs(xdr, args);
+        xdr->buf->flags |= XDRBUF_WRITE;
+}
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 2.2.10.  createargs
+ *
+ *      struct createargs {
+ *              diropargs where;
+ *              sattr attributes;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
-nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs_createargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        encode_diropargs(xdr, args->fh, args->name, args->len);
-        u32 offset = (u32)args->offset;
+        encode_sattr(xdr, args->sattr);
-        u32 count = args->count;
+}
-        p = xdr_encode_fhandle(p, args->fh);
-        *p++ = htonl(offset);
-        *p++ = htonl(offset);
-        *p++ = htonl(count);
-        *p++ = htonl(count);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
-        /* Copy the page array */
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
-        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+                                    struct xdr_stream *xdr,
-        sndbuf->flags |= XDRBUF_WRITE;
+                                    const struct nfs_removeargs *args)
-        return 0;
+{
+        encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
 }
 /*
- * Encode create arguments
+ * 2.2.12.  renameargs
- * CREATE, MKDIR
+ *
+ *      struct renameargs {
+ *              diropargs from;
+ *              diropargs to;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
-nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        const struct qstr *old = args->old_name;
-        p = xdr_encode_array(p, args->name, args->len);
+        const struct qstr *new = args->new_name;
-        p = xdr_encode_sattr(p, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        encode_diropargs(xdr, args->old_dir, old->name, old->len);
-        return 0;
+        encode_diropargs(xdr, args->new_dir, new->name, new->len);
 }
 /*
- * Encode RENAME arguments
+ * 2.2.13.  linkargs
+ *
+ *      struct linkargs {
+ *              fhandle from;
+ *              diropargs to;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
-nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+                                  struct xdr_stream *xdr,
+                                  const struct nfs_linkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->old_dir);
+        encode_fhandle(xdr, args->fromfh);
-        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+        encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
-        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode LINK arguments
+ * 2.2.14.  symlinkargs
+ *
+ *      struct symlinkargs {
+ *              diropargs from;
+ *              path to;
+ *              sattr attributes;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
-nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs_symlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
-        p = xdr_encode_fhandle(p, args->tofh);
+        encode_path(xdr, args->pages, args->pathlen);
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        encode_sattr(xdr, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SYMLINK arguments
+ * 2.2.17.  readdirargs
+ *
+ *      struct readdirargs {
+ *              fhandle dir;
+ *              nfscookie cookie;
+ *              unsigned count;
+ *      };
 */
-static int
+static void encode_readdirargs(struct xdr_stream *xdr,
-nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
+                               const struct nfs_readdirargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        __be32 *p;
-        size_t pad;
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_fhandle(xdr, args->fh);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
-        *p++ = htonl(args->pathlen);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
-        xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+        p = xdr_reserve_space(xdr, 4 + 4);
+        *p++ = cpu_to_be32(args->cookie);
+        *p = cpu_to_be32(args->count);
+}
-        /*
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
-         * xdr_encode_pages may have added a few bytes to ensure the
+                                     struct xdr_stream *xdr,
-         * pathname ends on a 4-byte boundary.  Start encoding the
+                                     const struct nfs_readdirargs *args)
-         * attributes after the pad bytes.
+{
-         */
+        encode_readdirargs(xdr, args);
-        pad = sndbuf->tail->iov_len;
+        prepare_reply_buffer(req, args->pages, 0,
-        if (pad > 0)
+                                        args->count, NFS_readdirres_sz);
-                p++;
-        p = xdr_encode_sattr(p, args->sattr);
-        sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
-        return 0;
 }
 /*
- * Encode arguments to readdir call
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
 */
-static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+                             void *__unused)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        enum nfs_stat status;
-        unsigned int replen;
+        int error;
-        u32 count = args->count;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
+}
-        p = xdr_encode_fhandle(p, args->fh);
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
-        *p++ = htonl(args->cookie);
+                                 struct nfs_fattr *result)
-        *p++ = htonl(count); /* see above */
+{
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        return decode_attrstat(xdr, result);
+}
-        /* Inline the page array */
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2;
+                                 struct nfs_diropok *result)
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
+{
-        return 0;
+        return decode_diropres(xdr, result);
 }
 /*
- * Decode the result of a readdir call.
+ * 2.2.6.  readlinkres
- * We're not really decoding anymore, we just leave the buffer untouched
+ *
- * and only check that it is syntactically correct.
+ *      union readlinkres switch (stat status) {
- * The real decoding happens in nfs_decode_entry below, called directly
+ *      case NFS_OK:
- * from nfs_readdir for each entry.
+ *              path data;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
-nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
+                                    struct xdr_stream *xdr, void *__unused)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        enum nfs_stat status;
-        struct kvec *iov = rcvbuf->head;
+        int error;
-        struct page **page;
-        size_t hdrlen;
+        error = decode_stat(xdr, &status);
-        unsigned int pglen, recvd;
+        if (unlikely(error))
-        int status;
+                goto out;
+        if (status != NFS_OK)
-        if ((status = ntohl(*p++)))
+                goto out_default;
-                return nfs_stat_to_errno(status);
+        error = decode_path(xdr);
+out:
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        return error;
-        if (iov->iov_len < hdrlen) {
+out_default:
-                dprintk("NFS: READDIR reply header overflowed:"
+        return nfs_stat_to_errno(status);
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+}
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        pglen = rcvbuf->page_len;
+/*
-        recvd = rcvbuf->len - hdrlen;
+ * 2.2.7.  readres
-        if (pglen > recvd)
+ *
-                pglen = recvd;
+ *      union readres switch (stat status) {
-        page = rcvbuf->pages;
+ *      case NFS_OK:
-        return pglen;
+ *              fattr attributes;
+ *              nfsdata data;
+ *      default:
+ *              void;
+ *      };
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs_readres *result)
+{
+        enum nfs_stat nfs_status;
+        int rc;
+        /* The reply always starts with the status discriminant */
+        rc = decode_stat(xdr, &nfs_status);
+        if (unlikely(rc))
+                return rc;
+        /* A non-OK arm is void: just translate the status to an errno */
+        if (nfs_status != NFS_OK)
+                return nfs_stat_to_errno(nfs_status);
+        /* NFS_OK arm: attributes first, then the data */
+        rc = decode_fattr(xdr, result->fattr);
+        if (unlikely(rc))
+                return rc;
+        return decode_nfsdata(xdr, result);
 }
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs_writeres *result)
 {
-        dprintk("nfs: %s: prematurely hit end of receive buffer. "
+        /* All NFSv2 writes are "file sync" writes */
-                "Remaining buffer length is %tu words.\n",
+        result->verf->committed = NFS_FILE_SYNC;
-                func, xdr->end - xdr->p);
+        return decode_attrstat(xdr, result->fattr);
 }
-__be32 *
+/**
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17.  entry
+ *
+ *      struct entry {
+ *              unsigned        fileid;
+ *              filename        name;
+ *              nfscookie       cookie;
+ *              entry           *nextentry;
+ *      };
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
 {
        __be32 *p;
+        int error;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
-        if (!ntohl(*p++)) {
+        if (*p++ == xdr_zero) {
                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
+                if (unlikely(p == NULL))
                        goto out_overflow;
-                if (!ntohl(*p++))
+                if (*p++ == xdr_zero)
-                        return ERR_PTR(-EAGAIN);
+                        return -EAGAIN;
                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
+                return -EBADCOOKIE;
        }
-        p = xdr_inline_decode(xdr, 8);
+        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
+        entry->ino = be32_to_cpup(p);
-        entry->ino        = ntohl(*p++);
+        error = decode_filename_inline(xdr, &entry->name, &entry->len);
-        entry->len        = ntohl(*p++);
+        if (unlikely(error))
+                return error;
-        p = xdr_inline_decode(xdr, entry->len + 4);
+        /*
-        if (unlikely(!p))
+         * The type (size and byte order) of nfscookie isn't defined in
+         * RFC 1094.  This implementation assumes that it's an XDR uint32.
+         */
+        entry->prev_cookie = entry->cookie;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
                goto out_overflow;
-        entry->name       = (const char *) p;
+        entry->cookie = be32_to_cpup(p);
-        p                += XDR_QUADLEN(entry->len);
-        entry->prev_cookie        = entry->cookie;
-        entry->cookie     = ntohl(*p++);
        entry->d_type = DT_UNKNOWN;
-        p = xdr_inline_peek(xdr, 8);
+        return 0;
-        if (p != NULL)
-                entry->eof = !p[0] && p[1];
-        else
-                entry->eof = 0;
-        return p;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EAGAIN);
+        return -EAGAIN;
-}
-/*
- * NFS XDR decode functions
- */
-/*
- * Decode simple status reply
- */
-static int
-nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
-{
-        int     status;
-        if ((status = ntohl(*p++)) != 0)
-                status = nfs_stat_to_errno(status);
-        return status;
 }
 /*
- * Decode attrstat reply
+ * 2.2.17.  readdirres
- * GETATTR, SETATTR, WRITE
+ *
- */
+ *      union readdirres switch (stat status) {
-static int
+ *      case NFS_OK:
-nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+ *              struct {
-{
+ *                      entry *entries;
-        int     status;
+ *                      bool eof;
+ *              } readdirok;
-        if ((status = ntohl(*p++)))
+ *      default:
-                return nfs_stat_to_errno(status);
+ *              void;
-        xdr_decode_fattr(p, fattr);
+ *      };
-        return 0;
+ *
-}
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
-/*
+ * during subsequent nfs_readdir() calls.
- * Decode diropres reply
- * LOOKUP, CREATE, MKDIR
 */
-static int
+static int decode_readdirok(struct xdr_stream *xdr)
-nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 {
-        int     status;
+        u32 recvd, pglen;
+        size_t hdrlen;
-        if ((status = ntohl(*p++)))
+        pglen = xdr->buf->page_len;
-                return nfs_stat_to_errno(status);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        p = xdr_decode_fhandle(p, res->fh);
+        recvd = xdr->buf->len - hdrlen;
-        xdr_decode_fattr(p, res->fattr);
+        if (unlikely(pglen > recvd))
-        return 0;
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, pglen);
+        return pglen;
+out_cheating:
+        dprintk("NFS: server cheating in readdir result: "
+                "pglen %u > recvd %u\n", pglen, recvd);
+        pglen = recvd;
+        goto out;
 }
-/*
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
- * Encode READLINK args
+                                   struct xdr_stream *xdr, void *__unused)
- */
-static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        enum nfs_stat status;
-        unsigned int replen;
+        int error;
-        p = xdr_encode_fhandle(p, args->fh);
+        error = decode_stat(xdr, &status);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        if (unlikely(error))
+                goto out;
-        /* Inline the page array */
+        if (status != NFS_OK)
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
+                goto out_default;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
+        error = decode_readdirok(xdr);
-        return 0;
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READLINK reply
+ * 2.2.18.  statfsres
+ *
+ *      union statfsres switch (stat status) {
+ *      case NFS_OK:
+ *              struct {
+ *                      unsigned tsize;
+ *                      unsigned bsize;
+ *                      unsigned blocks;
+ *                      unsigned bfree;
+ *                      unsigned bavail;
+ *              } info;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
-nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        size_t hdrlen;
-        u32 len, recvd;
-        int     status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        /* Convert length of symlink */
-        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len) {
-                dprintk("nfs: server returned giant symlink!\n");
-                return -ENAMETOOLONG;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READLINK reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        recvd = req->rq_rcv_buf.len - hdrlen;
-        if (recvd < len) {
-                dprintk("NFS: server cheating in readlink reply: "
-                                "count %u > recvd %u\n", len, recvd);
-                return -EIO;
-        }
-        xdr_terminate_string(rcvbuf, len);
+        p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        result->tsize  = be32_to_cpup(p++);
+        result->bsize  = be32_to_cpup(p++);
+        result->blocks = be32_to_cpup(p++);
+        result->bfree  = be32_to_cpup(p++);
+        result->bavail = be32_to_cpup(p);
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-/*
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
- * Decode WRITE reply
+                                  struct nfs2_fsstat *result)
- */
-static int
-nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 {
-        res->verf->committed = NFS_FILE_SYNC;
+        enum nfs_stat status;
-        return nfs_xdr_attrstat(req, p, res->fattr);
+        int error;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_info(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
-/*
- * Decode STATFS reply
- */
-static int
-nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
-{
-        int     status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        res->tsize  = ntohl(*p++);
-        res->bsize  = ntohl(*p++);
-        res->blocks = ntohl(*p++);
-        res->bfree  = ntohl(*p++);
-        res->bavail = ntohl(*p++);
-        return 0;
-}
 /*
 * We need to translate between nfs status return values and
 * the local errno values which may not be the same.
 */
-static struct {
+static const struct {
        int stat;
        int errno;
 } nfs_errtbl[] = {
@@ -678,28 +1102,30 @@ static struct {
        { -1,                   -EIO            }
 };
-/*
+/**
- * Convert an NFS error code to a local one.
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
- * This one is used jointly by NFSv2 and NFSv3.
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
 */
-int
+int nfs_stat_to_errno(enum nfs_stat status)
-nfs_stat_to_errno(int stat)
 {
        int i;
        for (i = 0; nfs_errtbl[i].stat != -1; i++) {
-                if (nfs_errtbl[i].stat == stat)
+                if (nfs_errtbl[i].stat == (int)status)
                        return nfs_errtbl[i].errno;
        }
-        dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+        dprintk("NFS: Unrecognized nfs status value: %u\n", status);
        return nfs_errtbl[i].errno;
 }
 #define PROC(proc, argtype, restype, timer)                             \
 [NFSPROC_##proc] = {                                                    \
        .p_proc     =  NFSPROC_##proc,                                  \
-        .p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,                  \
+        .p_encode   =  (kxdreproc_t)nfs2_xdr_enc_##argtype,             \
-        .p_decode   =  (kxdrproc_t) nfs_xdr_##restype,                  \
+        .p_decode   =  (kxdrdproc_t)nfs2_xdr_dec_##restype,             \
        .p_arglen   =  NFS_##argtype##_sz,                              \
        .p_replen   =  NFS_##restype##_sz,                              \
        .p_timer    =  timer,                                           \
@@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat)
        .p_name     =  #proc,                                           \
        }
 struct rpc_procinfo     nfs_procedures[] = {
-    PROC(GETATTR,       fhandle,        attrstat, 1),
+        PROC(GETATTR,   fhandle,        attrstat,       1),
-    PROC(SETATTR,       sattrargs,      attrstat, 0),
+        PROC(SETATTR,   sattrargs,      attrstat,       0),
-    PROC(LOOKUP,        diropargs,      diropres, 2),
+        PROC(LOOKUP,    diropargs,      diropres,       2),
-    PROC(READLINK,      readlinkargs,   readlinkres, 3),
+        PROC(READLINK,  readlinkargs,   readlinkres,    3),
-    PROC(READ,          readargs,       readres, 3),
+        PROC(READ,      readargs,       readres,        3),
-    PROC(WRITE,         writeargs,      writeres, 4),
+        PROC(WRITE,     writeargs,      writeres,       4),
-    PROC(CREATE,        createargs,     diropres, 0),
+        PROC(CREATE,    createargs,     diropres,       0),
-    PROC(REMOVE,        removeargs,     stat, 0),
+        PROC(REMOVE,    removeargs,     stat,           0),
-    PROC(RENAME,        renameargs,     stat, 0),
+        PROC(RENAME,    renameargs,     stat,           0),
-    PROC(LINK,          linkargs,       stat, 0),
+        PROC(LINK,      linkargs,       stat,           0),
-    PROC(SYMLINK,       symlinkargs,    stat, 0),
+        PROC(SYMLINK,   symlinkargs,    stat,           0),
-    PROC(MKDIR,         createargs,     diropres, 0),
+        PROC(MKDIR,     createargs,     diropres,       0),
-    PROC(RMDIR,         diropargs,      stat, 0),
+        PROC(RMDIR,     diropargs,      stat,           0),
-    PROC(READDIR,       readdirargs,    readdirres, 3),
+        PROC(READDIR,   readdirargs,    readdirres,     3),
-    PROC(STATFS,        fhandle,        statfsres, 0),
+        PROC(STATFS,    fhandle,        statfsres,      0),
 };
 struct rpc_version              nfs_version2 = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f6cc60f06da..01c5e8b1941 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
 #define NFS3_filename_sz        (1+(NFS3_MAXNAMLEN>>2))
 #define NFS3_path_sz            (1+(NFS3_MAXPATHLEN>>2))
 #define NFS3_fattr_sz           (21)
-#define NFS3_wcc_attr_sz                (6)
+#define NFS3_cookieverf_sz      (NFS3_COOKIEVERFSIZE>>2)
+#define NFS3_wcc_attr_sz        (6)
 #define NFS3_pre_op_attr_sz     (1+NFS3_wcc_attr_sz)
 #define NFS3_post_op_attr_sz    (1+NFS3_fattr_sz)
-#define NFS3_wcc_data_sz                (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
+#define NFS3_wcc_data_sz        (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
-#define NFS3_fsstat_sz          
-#define NFS3_fsinfo_sz          
-#define NFS3_pathconf_sz                
-#define NFS3_entry_sz           (NFS3_filename_sz+3)
-#define NFS3_sattrargs_sz       (NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz       (NFS3_fh_sz+NFS3_filename_sz)
-#define NFS3_removeargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_getattrargs_sz     (NFS3_fh_sz)
+#define NFS3_setattrargs_sz     (NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz      (NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz    (NFS3_fh_sz)
 #define NFS3_readargs_sz        (NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
 #define NFS3_mkdirargs_sz       (NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_symlinkargs_sz     (NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz       (NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_renameargs_sz      (NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz                (NFS3_fh_sz+NFS3_diropargs_sz)
-#define NFS3_readdirargs_sz     (NFS3_fh_sz+2)
+#define NFS3_readdirargs_sz     (NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
 #define NFS3_commitargs_sz      (NFS3_fh_sz+3)
-#define NFS3_attrstat_sz        (1+NFS3_fattr_sz)
+#define NFS3_getattrres_sz      (1+NFS3_fattr_sz)
-#define NFS3_wccstat_sz         (1+NFS3_wcc_data_sz)
+#define NFS3_setattrres_sz      (1+NFS3_wcc_data_sz)
-#define NFS3_removeres_sz       (NFS3_wccstat_sz)
+#define NFS3_removeres_sz       (NFS3_setattrres_sz)
 #define NFS3_lookupres_sz       (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz       (1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz     (1+NFS3_post_op_attr_sz+1)
@@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = {
        [NF3FIFO] = S_IFIFO,
 };
+/*
+ * Used on the argument-encoding path: attach @pages to the request's
+ * receive buffer ahead of time so that the reply payload is received
+ * straight into the page cache.  The RPC reply header size, the
+ * auth flavor's slack and @bufsize are summed and scaled by four
+ * before being handed to xdr_inline_pages(), so @bufsize is
+ * presumably a count of 32-bit XDR words.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+                                 unsigned int base, unsigned int len,
+                                 unsigned int bufsize)
+{
+        struct rpc_auth *flavor = req->rq_cred->cr_auth;
+        unsigned int hdr_words = RPC_REPHDRSIZE + flavor->au_rslack + bufsize;
+        xdr_inline_pages(&req->rq_rcv_buf, hdr_words << 2, pages, base, len);
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-        dprintk("nfs: %s: prematurely hit end of receive buffer. "
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
                "Remaining buffer length is %tu words.\n",
                func, xdr->end - xdr->p);
 }
 /*
- * Common NFS XDR functions as inlines
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
 */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
 {
-        return xdr_encode_array(p, fh->data, fh->size);
+        __be32 *p = xdr_reserve_space(xdr, 4);
+        *p = cpu_to_be32(value);
 }
-static inline __be32 *
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
 {
-        if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
+        __be32 *p;
-                memcpy(fh->data, p, fh->size);
-                return p + XDR_QUADLEN(fh->size);
+        p = xdr_inline_decode(xdr, 4);
-        }
+        if (unlikely(p == NULL))
-        return NULL;
+                goto out_overflow;
+        *value = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * Pull one eight-byte XDR hyper off the stream and store it in @value.
+ * Returns zero on success, or -EIO (after logging the overflow) when
+ * xdr_inline_decode() cannot supply the eight bytes.
+ */
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+        __be32 *word = xdr_inline_decode(xdr, 8);
+        if (unlikely(!word)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        xdr_decode_hyper(word, value);
+        return 0;
+}
+/*
+ * fileid3
+ *
+ *      typedef uint64 fileid3;
+ *
+ * Decode a fileid3 from the raw XDR words at @p into @fileid by
+ * delegating to xdr_decode_hyper().  The return value is whatever
+ * xdr_decode_hyper() returns (presumably a pointer to the word that
+ * follows the decoded value).
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+        return xdr_decode_hyper(p, fileid);
+}
+/*
+ * Stream-based variant: a fileid3 is decoded exactly like a uint64,
+ * so the result of decode_uint64() is passed straight through.
+ */
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+        return decode_uint64(xdr, fileid);
+}
+/*
+ * filename3
+ *
+ *      typedef string filename3<>;
+ *
+ * Append @length bytes of @name to the stream as an XDR opaque.
+ * A @length above NFS3_MAXNAMLEN trips BUG_ON().
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+                             const char *name, u32 length)
+{
+        __be32 *p;
+        /* An over-long name is treated as a caller bug, not a runtime error */
+        BUG_ON(length > NFS3_MAXNAMLEN);
+        /* Four bytes for the length word plus the name bytes themselves */
+        p = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(p, name, length);
 }
-static inline __be32 *
+static int decode_inline_filename3(struct xdr_stream *xdr,
-xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+                                   const char **name, u32 *length)
 {
        __be32 *p;
+        u32 count;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        if (count > NFS3_MAXNAMLEN)
+                goto out_nametoolong;
+        p = xdr_inline_decode(xdr, count);
+        if (unlikely(p == NULL))
                goto out_overflow;
-        fh->size = ntohl(*p++);
+        *name = (const char *)p;
+        *length = count;
+        return 0;
-        if (fh->size <= NFS3_FHSIZE) {
+out_nametoolong:
-                p = xdr_inline_decode(xdr, fh->size);
+        dprintk("NFS: returned filename too long: %u\n", count);
-                if (unlikely(!p))
+        return -ENAMETOOLONG;
-                        goto out_overflow;
+out_overflow:
-                memcpy(fh->data, p, fh->size);
+        print_overflow_msg(__func__, xdr);
-                return p + XDR_QUADLEN(fh->size);
+        return -EIO;
-        }
+}
-        return NULL;
+/*
+ * nfspath3
+ *
+ *      typedef string nfspath3<>;
+ *
+ * Only the 32-bit length is written inline; the path bytes are taken
+ * from @pages, which are attached to the stream via xdr_write_pages()
+ * with a base of 0.  A @length above NFS3_MAXPATHLEN trips BUG_ON().
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+                            const u32 length)
+{
+        BUG_ON(length > NFS3_MAXPATHLEN);
+        encode_uint32(xdr, length);
+        xdr_write_pages(xdr, pages, 0, length);
+}
+/*
+ * Decode an nfspath3 whose bytes were received into the page data of
+ * the reply buffer.
+ *
+ * Returns zero on success, -ENAMETOOLONG if the advertised length is
+ * not smaller than the page area or exceeds NFS3_MAXPATHLEN, and -EIO
+ * if the length word cannot be read or the advertised length exceeds
+ * the number of bytes remaining in the buffer.
+ */
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+        u32 recvd, count;
+        size_t hdrlen;
+        __be32 *p;
+        /* The path is preceded by a four-byte length word */
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        /*
+         * NOTE(review): ">=" rather than ">" against page_len presumably
+         * leaves room for the terminator written by
+         * xdr_terminate_string() below -- confirm.
+         */
+        if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+                goto out_nametoolong;
+        /* Bytes consumed so far, measured from the start of the current iovec */
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        /* What is left of the buffer after those bytes */
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+        xdr_read_pages(xdr, count);
+        xdr_terminate_string(xdr->buf, count);
+        return 0;
+out_nametoolong:
+        dprintk("NFS: returned pathname too long: %u\n", count);
+        return -ENAMETOOLONG;
+out_cheating:
+        dprintk("NFS: server cheating in pathname result: "
+                "count %u > recvd %u\n", count, recvd);
+        return -EIO;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EIO;
 }
 /*
- * Encode/decode time.
+ * cookie3
+ *
+ *      typedef uint64 cookie3;
 */
-static inline __be32 *
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
-xdr_encode_time3(__be32 *p, struct timespec *timep)
 {
-        *p++ = htonl(timep->tv_sec);
+        return xdr_encode_hyper(p, cookie);
-        *p++ = htonl(timep->tv_nsec);
-        return p;
 }
-static inline __be32 *
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
-xdr_decode_time3(__be32 *p, struct timespec *timep)
 {
-        timep->tv_sec = ntohl(*p++);
+        return decode_uint64(xdr, cookie);
-        timep->tv_nsec = ntohl(*p++);
+}
-        return p;
+/*
+ * cookieverf3
+ *
+ *      typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+/*
+ * Copy the fixed-size cookie verifier into the buffer at @p and
+ * return the position advanced by XDR_QUADLEN(NFS3_COOKIEVERFSIZE)
+ * words.
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+        memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+        p += XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+        return p;
+}
+/*
+ * Copy NFS3_COOKIEVERFSIZE bytes of cookie verifier from the stream
+ * into @verifier.  Returns zero on success, or -EIO (after logging
+ * the overflow) when xdr_inline_decode() cannot supply the bytes.
+ */
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+        __be32 *src = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+        if (unlikely(!src)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        memcpy(verifier, src, NFS3_COOKIEVERFSIZE);
+        return 0;
+}
+/*
+ * createverf3
+ *
+ *      typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ *
+ * Being a fixed-length opaque, the verifier is copied into the
+ * stream as-is, with no length word in front of it.
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+        __be32 *dst = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+        memcpy(dst, verifier, NFS3_CREATEVERFSIZE);
+}
+/*
+ * Copy NFS3_WRITEVERFSIZE bytes of write verifier from the stream
+ * into @verifier.  Returns zero on success, or -EIO (after logging
+ * the overflow) when xdr_inline_decode() cannot supply the bytes.
+ */
+static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+        __be32 *src = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+        if (unlikely(!src)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        memcpy(verifier, src, NFS3_WRITEVERFSIZE);
+        return 0;
+}
+/*
+ * size3
+ *
+ *      typedef uint64 size3;
+ *
+ * Decode a size3 from the raw XDR words at @p into @size by
+ * delegating to xdr_decode_hyper(), whose return value is passed
+ * back unchanged.
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+        return xdr_decode_hyper(p, size);
+}
+/*
+ * nfsstat3
+ *
+ *      enum nfsstat3 {
+ *              NFS3_OK = 0,
+ *              ...
+ *      };
+ */
+#define NFS3_OK         NFS_OK
+/*
+ * Read the four-byte status discriminant that opens a reply and
+ * store it, converted from big-endian, in @status.  Returns zero on
+ * success, or -EIO (after logging the overflow) when
+ * xdr_inline_decode() cannot supply the word.
+ */
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+        __be32 *word = xdr_inline_decode(xdr, 4);
+        if (unlikely(!word)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        *status = be32_to_cpup(word);
+        return 0;
+}
+/*
+ * ftype3
+ *
+ *      enum ftype3 {
+ *              NF3REG  = 1,
+ *              NF3DIR  = 2,
+ *              NF3BLK  = 3,
+ *              NF3CHR  = 4,
+ *              NF3LNK  = 5,
+ *              NF3SOCK = 6,
+ *              NF3FIFO = 7
+ *      };
+ */
+/*
+ * Emit an ftype3 as a single 32-bit word.  NF3FIFO is the largest
+ * value the enum defines, so anything above it trips BUG_ON().
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+        BUG_ON(type > NF3FIFO);
+        encode_uint32(xdr, type);
 }
-static __be32 *
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
-        unsigned int    type, major, minor;
+        u32 type;
-        umode_t         fmode;
-        type = ntohl(*p++);
+        type = be32_to_cpup(p++);
        if (type > NF3FIFO)
                type = NF3NON;
-        fmode = nfs_type2fmt[type];
+        *mode = nfs_type2fmt[type];
-        fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
+        return p;
-        fattr->nlink = ntohl(*p++);
+}
-        fattr->uid = ntohl(*p++);
-        fattr->gid = ntohl(*p++);
-        p = xdr_decode_hyper(p, &fattr->size);
-        p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
-        /* Turn remote device info into Linux-specific dev_t */
-        major = ntohl(*p++);
-        minor = ntohl(*p++);
-        fattr->rdev = MKDEV(major, minor);
-        if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
-                fattr->rdev = 0;
-        p = xdr_decode_hyper(p, &fattr->fsid.major);
+/*
-        fattr->fsid.minor = 0;
+ * specdata3
-        p = xdr_decode_hyper(p, &fattr->fileid);
+ *
-        p = xdr_decode_time3(p, &fattr->atime);
+ *     struct specdata3 {
-        p = xdr_decode_time3(p, &fattr->mtime);
+ *             uint32  specdata1;
-        p = xdr_decode_time3(p, &fattr->ctime);
+ *             uint32  specdata2;
+ *     };
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+        __be32 *p;
-        /* Update the mode bits */
+        p = xdr_reserve_space(xdr, 8);
-        fattr->valid |= NFS_ATTR_FATTR_V3;
+        *p++ = cpu_to_be32(MAJOR(rdev));
+        *p = cpu_to_be32(MINOR(rdev));
+}
+/*
+ * Turn the two specdata3 words at @p into a dev_t stored in @rdev,
+ * using the first word as the major and the second as the minor
+ * number.  If either number does not survive the round trip through
+ * MKDEV()/MAJOR()/MINOR(), zero is stored instead.  Returns the
+ * position just past the two words.
+ */
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+        const unsigned int spec1 = be32_to_cpup(p);
+        const unsigned int spec2 = be32_to_cpup(p + 1);
+        dev_t dev = MKDEV(spec1, spec2);
+        if (MAJOR(dev) != spec1 || MINOR(dev) != spec2)
+                dev = 0;
+        *rdev = dev;
+        return p + 2;
+}
+/*
+ * nfs_fh3
+ *
+ *      struct nfs_fh3 {
+ *              opaque       data<NFS3_FHSIZE>;
+ *      };
+ */
+/*
+ * Append the file handle @fh to the stream as an XDR opaque.
+ * A handle larger than NFS3_FHSIZE trips BUG_ON().
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        __be32 *p;
+        BUG_ON(fh->size > NFS3_FHSIZE);
+        /* Four bytes for the length word plus the handle bytes themselves */
+        p = xdr_reserve_space(xdr, 4 + fh->size);
+        xdr_encode_opaque(p, fh->data, fh->size);
+}
+/*
+ * Decode a variable-length file handle from the stream into @fh.
+ *
+ * Returns zero on success, -E2BIG if the advertised length exceeds
+ * NFS3_FHSIZE, or -EIO (after logging the overflow) when
+ * xdr_inline_decode() cannot supply the length word or the handle
+ * bytes.  @fh is only written once both reads have succeeded.
+ */
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+        u32 fh_len;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        fh_len = be32_to_cpup(p);
+        if (unlikely(fh_len > NFS3_FHSIZE)) {
+                dprintk("NFS: file handle size (%u) too big\n", fh_len);
+                return -E2BIG;
+        }
+        p = xdr_inline_decode(xdr, fh_len);
+        if (unlikely(!p))
+                goto out_overflow;
+        memcpy(fh->data, p, fh_len);
+        fh->size = fh_len;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/* Clear every byte of @fh, which leaves both its size and data zeroed. */
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+        memset(fh, 0, sizeof(*fh));
+}
+/*
+ * nfstime3
+ *
+ *      struct nfstime3 {
+ *              uint32  seconds;
+ *              uint32  nseconds;
+ *      };
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+        /* Each field goes out as one big-endian 32-bit word: seconds first */
+        *p++ = cpu_to_be32(timep->tv_sec);
+        /* ... then nanoseconds */
+        *p++ = cpu_to_be32(timep->tv_nsec);
+        /* Hand back the position just past the two words written */
        return p;
 }
-static inline __be32 *
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
+        timep->tv_sec = be32_to_cpup(p++);
+        timep->tv_nsec = be32_to_cpup(p++);
+        return p;
+}
+/*
+ * sattr3
+ *
+ *      enum time_how {
+ *              DONT_CHANGE             = 0,
+ *              SET_TO_SERVER_TIME      = 1,
+ *              SET_TO_CLIENT_TIME      = 2
+ *      };
+ *
+ *      union set_mode3 switch (bool set_it) {
+ *      case TRUE:
+ *              mode3   mode;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_uid3 switch (bool set_it) {
+ *      case TRUE:
+ *              uid3    uid;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_gid3 switch (bool set_it) {
+ *      case TRUE:
+ *              gid3    gid;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_size3 switch (bool set_it) {
+ *      case TRUE:
+ *              size3   size;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_atime switch (time_how set_it) {
+ *      case SET_TO_CLIENT_TIME:
+ *              nfstime3        atime;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_mtime switch (time_how set_it) {
+ *      case SET_TO_CLIENT_TIME:
+ *              nfstime3  mtime;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct sattr3 {
+ *              set_mode3       mode;
+ *              set_uid3        uid;
+ *              set_gid3        gid;
+ *              set_size3       size;
+ *              set_atime       atime;
+ *              set_mtime       mtime;
+ *      };
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+        u32 nbytes;
+        __be32 *p;
+        /*
+         * In order to make only a single xdr_reserve_space() call,
+         * pre-compute the total number of bytes to be reserved.
+         * Six 4-byte discriminants (four bool, two time_how), one per
+         * set_foo field, are always present in the result, so start there.
+         */
+        nbytes = 6 * 4;
+        if (attr->ia_valid & ATTR_MODE)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_UID)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_GID)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_SIZE)
+                nbytes += 8;
+        if (attr->ia_valid & ATTR_ATIME_SET)
+                nbytes += 8;
+        if (attr->ia_valid & ATTR_MTIME_SET)
+                nbytes += 8;
+        p = xdr_reserve_space(xdr, nbytes);
        if (attr->ia_valid & ATTR_MODE) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_mode & S_IALLUGO);
+                *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_UID) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_uid);
+                *p++ = cpu_to_be32(attr->ia_uid);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_GID) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_gid);
+                *p++ = cpu_to_be32(attr->ia_gid);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_SIZE) {
                *p++ = xdr_one;
-                p = xdr_encode_hyper(p, (__u64) attr->ia_size);
+                p = xdr_encode_hyper(p, (u64)attr->ia_size);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_ATIME_SET) {
                *p++ = xdr_two;
-                p = xdr_encode_time3(p, &attr->ia_atime);
+                p = xdr_encode_nfstime3(p, &attr->ia_atime);
        } else if (attr->ia_valid & ATTR_ATIME) {
                *p++ = xdr_one;
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_MTIME_SET) {
                *p++ = xdr_two;
-                p = xdr_encode_time3(p, &attr->ia_mtime);
+                xdr_encode_nfstime3(p, &attr->ia_mtime);
        } else if (attr->ia_valid & ATTR_MTIME) {
-                *p++ = xdr_one;
+                *p = xdr_one;
-        } else {
+        } else
-                *p++ = xdr_zero;
+                *p = xdr_zero;
-        }
+}
-        return p;
+/*
+ * fattr3
+ *
+ *      struct fattr3 {
+ *              ftype3          type;
+ *              mode3           mode;
+ *              uint32          nlink;
+ *              uid3            uid;
+ *              gid3            gid;
+ *              size3           size;
+ *              size3           used;
+ *              specdata3       rdev;
+ *              uint64          fsid;
+ *              fileid3         fileid;
+ *              nfstime3        atime;
+ *              nfstime3        mtime;
+ *              nfstime3        ctime;
+ *      };
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+        umode_t fmode;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        p = xdr_decode_ftype3(p, &fmode);
+        fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+        fattr->nlink = be32_to_cpup(p++);
+        fattr->uid = be32_to_cpup(p++);
+        fattr->gid = be32_to_cpup(p++);
+        p = xdr_decode_size3(p, &fattr->size);
+        p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+        p = xdr_decode_specdata3(p, &fattr->rdev);
+        p = xdr_decode_hyper(p, &fattr->fsid.major);
+        fattr->fsid.minor = 0;
+        p = xdr_decode_fileid3(p, &fattr->fileid);
+        p = xdr_decode_nfstime3(p, &fattr->atime);
+        p = xdr_decode_nfstime3(p, &fattr->mtime);
+        xdr_decode_nfstime3(p, &fattr->ctime);
+        fattr->valid |= NFS_ATTR_FATTR_V3;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-static inline __be32 *
+/*
-xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
+ * post_op_attr
+ *
+ *      union post_op_attr switch (bool attributes_follow) {
+ *      case TRUE:
+ *              fattr3  attributes;
+ *      case FALSE:
+ *              void;
+ *      };
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-        p = xdr_decode_hyper(p, &fattr->pre_size);
+        __be32 *p;
-        p = xdr_decode_time3(p, &fattr->pre_mtime);
-        p = xdr_decode_time3(p, &fattr->pre_ctime);
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        if (*p != xdr_zero)
+                return decode_fattr3(xdr, fattr);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * wcc_attr
+ *      struct wcc_attr {
+ *              size3           size;
+ *              nfstime3        mtime;
+ *              nfstime3        ctime;
+ *      };
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
        fattr->valid |= NFS_ATTR_FATTR_PRESIZE
                | NFS_ATTR_FATTR_PREMTIME
                | NFS_ATTR_FATTR_PRECTIME;
-        return p;
-}
-static inline __be32 *
+        p = xdr_decode_size3(p, &fattr->pre_size);
-xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
+        p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
-{
+        xdr_decode_nfstime3(p, &fattr->pre_ctime);
-        if (*p++)
-                p = xdr_decode_fattr(p, fattr);
+        return 0;
-        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-static inline __be32 *
+/*
-xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+ * pre_op_attr
+ *      union pre_op_attr switch (bool attributes_follow) {
+ *      case TRUE:
+ *              wcc_attr        attributes;
+ *      case FALSE:
+ *              void;
+ *      };
+ *
+ * wcc_data
+ *
+ *      struct wcc_data {
+ *              pre_op_attr     before;
+ *              post_op_attr    after;
+ *      };
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
        __be32 *p;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
-        if (ntohl(*p++)) {
+        if (*p != xdr_zero)
-                p = xdr_inline_decode(xdr, 84);
+                return decode_wcc_attr(xdr, fattr);
-                if (unlikely(!p))
+        return 0;
-                        goto out_overflow;
-                p = xdr_decode_fattr(p, fattr);
-        }
-        return p;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EIO;
 }
-static inline __be32 *
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
-xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
-        if (*p++)
+        int error;
-                return xdr_decode_wcc_attr(p, fattr);
-        return p;
+        error = decode_pre_op_attr(xdr, fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, fattr);
+out:
+        return error;
 }
+/*
+ * post_op_fh3
+ *
+ *      union post_op_fh3 switch (bool handle_follows) {
+ *      case TRUE:
+ *              nfs_fh3  handle;
+ *      case FALSE:
+ *              void;
+ *      };
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        if (*p != xdr_zero)
+                return decode_nfs_fh3(xdr, fh);
+        zero_nfs_fh3(fh);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
-static inline __be32 *
+/*
-xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
+ * diropargs3
+ *
+ *      struct diropargs3 {
+ *              nfs_fh3         dir;
+ *              filename3       name;
+ *      };
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+                              const char *name, u32 length)
 {
-        p = xdr_decode_pre_op_attr(p, fattr);
+        encode_nfs_fh3(xdr, fh);
-        return xdr_decode_post_op_attr(p, fattr);
+        encode_filename3(xdr, name, length);
 }
 /*
- * NFS encode functions
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
 */
 /*
- * Encode file handle argument
+ * 3.3.1  GETATTR3args
+ *
+ *      struct GETATTR3args {
+ *              nfs_fh3  object;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
-nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+                                      struct xdr_stream *xdr,
+                                      const struct nfs_fh *fh)
 {
-        p = xdr_encode_fhandle(p, fh);
+        encode_nfs_fh3(xdr, fh);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SETATTR arguments
+ * 3.3.2  SETATTR3args
+ *
+ *      union sattrguard3 switch (bool check) {
+ *      case TRUE:
+ *              nfstime3  obj_ctime;
+ *      case FALSE:
+ *              void;
+ *      };
+ *
+ *      struct SETATTR3args {
+ *              nfs_fh3         object;
+ *              sattr3          new_attributes;
+ *              sattrguard3     guard;
+ *      };
 */
-static int
+static void encode_sattrguard3(struct xdr_stream *xdr,
-nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
+                               const struct nfs3_sattrargs *args)
-{
+{
-        p = xdr_encode_fhandle(p, args->fh);
+        __be32 *p;
-        p = xdr_encode_sattr(p, args->sattr);
-        *p++ = htonl(args->guard);
+        if (args->guard) {
-        if (args->guard)
+                p = xdr_reserve_space(xdr, 4 + 8);
-                p = xdr_encode_time3(p, &args->guardtime);
+                *p++ = xdr_one;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+                xdr_encode_nfstime3(p, &args->guardtime);
-        return 0;
+        } else {
+                p = xdr_reserve_space(xdr, 4);
+                *p = xdr_zero;
+        }
+}
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs3_sattrargs *args)
+{
+        encode_nfs_fh3(xdr, args->fh);
+        encode_sattr3(xdr, args->sattr);
+        encode_sattrguard3(xdr, args);
 }
 /*
- * Encode directory ops argument
+ * 3.3.3  LOOKUP3args
+ *
+ *      struct LOOKUP3args {
+ *              diropargs3  what;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
-nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_diropargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
-        p = xdr_encode_array(p, args->name, args->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode REMOVE argument
+ * 3.3.4  ACCESS3args
+ *
+ *      struct ACCESS3args {
+ *              nfs_fh3         object;
+ *              uint32          access;
+ *      };
 */
-static int
+static void encode_access3args(struct xdr_stream *xdr,
-nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+                               const struct nfs3_accessargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_array(p, args->name.name, args->name.len);
+        encode_uint32(xdr, args->access);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+}
-        return 0;
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_accessargs *args)
+{
+        encode_access3args(xdr, args);
 }
 /*
- * Encode access() argument
+ * 3.3.5  READLINK3args
+ *
+ *      struct READLINK3args {
+ *              nfs_fh3 symlink;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
+                                       struct xdr_stream *xdr,
+                                       const struct nfs3_readlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_nfs_fh3(xdr, args->fh);
-        *p++ = htonl(args->access);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+                                        args->pglen, NFS3_readlinkres_sz);
-        return 0;
 }
 /*
- * Arguments to a READ call. Since we read data directly into the page
+ * 3.3.6  READ3args
- * cache, we also set up the reply iovec here so that iov[1] points
+ *
- * exactly to the page we want to fetch.
+ *      struct READ3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_read3args(struct xdr_stream *xdr,
-nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+                             const struct nfs_readargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        __be32 *p;
-        unsigned int replen;
-        u32 count = args->count;
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(count);
+        *p = cpu_to_be32(args->count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+}
-        /* Inline the page array */
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
+                                   struct xdr_stream *xdr,
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
+                                   const struct nfs_readargs *args)
-                         args->pages, args->pgbase, count);
+{
+        encode_read3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->count, NFS3_readres_sz);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
-        return 0;
 }
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 3.3.7  WRITE3args
+ *
+ *      enum stable_how {
+ *              UNSTABLE  = 0,
+ *              DATA_SYNC = 1,
+ *              FILE_SYNC = 2
+ *      };
+ *
+ *      struct WRITE3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *              stable_how      stable;
+ *              opaque          data<>;
+ *      };
 */
-static int
+static void encode_write3args(struct xdr_stream *xdr,
-nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                              const struct nfs_writeargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        __be32 *p;
-        u32 count = args->count;
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(count);
+        *p++ = cpu_to_be32(args->count);
-        *p++ = htonl(args->stable);
+        *p++ = cpu_to_be32(args->stable);
-        *p++ = htonl(count);
+        *p = cpu_to_be32(args->count);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+        xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
-        /* Copy the page array */
-        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
-        sndbuf->flags |= XDRBUF_WRITE;
+                                    struct xdr_stream *xdr,
-        return 0;
+                                    const struct nfs_writeargs *args)
+{
+        encode_write3args(xdr, args);
+        xdr->buf->flags |= XDRBUF_WRITE;
 }
 /*
- * Encode CREATE arguments
+ * 3.3.8  CREATE3args
+ *
+ *      enum createmode3 {
+ *              UNCHECKED = 0,
+ *              GUARDED   = 1,
+ *              EXCLUSIVE = 2
+ *      };
+ *
+ *      union createhow3 switch (createmode3 mode) {
+ *      case UNCHECKED:
+ *      case GUARDED:
+ *              sattr3       obj_attributes;
+ *      case EXCLUSIVE:
+ *              createverf3  verf;
+ *      };
+ *
+ *      struct CREATE3args {
+ *              diropargs3      where;
+ *              createhow3      how;
+ *      };
 */
-static int
+static void encode_createhow3(struct xdr_stream *xdr,
-nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
+                              const struct nfs3_createargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_uint32(xdr, args->createmode);
-        p = xdr_encode_array(p, args->name, args->len);
+        switch (args->createmode) {
+        case NFS3_CREATE_UNCHECKED:
-        *p++ = htonl(args->createmode);
+        case NFS3_CREATE_GUARDED:
-        if (args->createmode == NFS3_CREATE_EXCLUSIVE) {
+                encode_sattr3(xdr, args->sattr);
-                *p++ = args->verifier[0];
+                break;
-                *p++ = args->verifier[1];
+        case NFS3_CREATE_EXCLUSIVE:
-        } else
+                encode_createverf3(xdr, args->verifier);
-                p = xdr_encode_sattr(p, args->sattr);
+                break;
+        default:
+                BUG();
+        }
+}
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
-        return 0;
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_createargs *args)
+{
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
+        encode_createhow3(xdr, args);
 }
 /*
- * Encode MKDIR arguments
+ * 3.3.9  MKDIR3args
+ *
+ *      struct MKDIR3args {
+ *              diropargs3      where;
+ *              sattr3          attributes;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs3_mkdirargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_sattr(p, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SYMLINK arguments
+ * 3.3.10  SYMLINK3args
+ *
+ *      struct symlinkdata3 {
+ *              sattr3          symlink_attributes;
+ *              nfspath3        symlink_data;
+ *      };
+ *
+ *      struct SYMLINK3args {
+ *              diropargs3      where;
+ *              symlinkdata3    symlink;
+ *      };
 */
-static int
+static void encode_symlinkdata3(struct xdr_stream *xdr,
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
+                                const struct nfs3_symlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        encode_nfspath3(xdr, args->pages, args->pathlen);
-        p = xdr_encode_sattr(p, args->sattr);
+}
-        *p++ = htonl(args->pathlen);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* Copy the page */
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
-        xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
+                                      struct xdr_stream *xdr,
-        return 0;
+                                      const struct nfs3_symlinkargs *args)
+{
+        encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+        encode_symlinkdata3(xdr, args);
 }
 /*
- * Encode MKNOD arguments
+ * 3.3.11  MKNOD3args
+ *
+ *      struct devicedata3 {
+ *              sattr3          dev_attributes;
+ *              specdata3       spec;
+ *      };
+ *
+ *      union mknoddata3 switch (ftype3 type) {
+ *      case NF3CHR:
+ *      case NF3BLK:
+ *              devicedata3     device;
+ *      case NF3SOCK:
+ *      case NF3FIFO:
+ *              sattr3          pipe_attributes;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct MKNOD3args {
+ *              diropargs3      where;
+ *              mknoddata3      what;
+ *      };
 */
-static int
+static void encode_devicedata3(struct xdr_stream *xdr,
-nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
+                               const struct nfs3_mknodargs *args)
-{
+{
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_specdata3(xdr, args->rdev);
-        *p++ = htonl(args->type);
+}
-        p = xdr_encode_sattr(p, args->sattr);
-        if (args->type == NF3CHR || args->type == NF3BLK) {
+static void encode_mknoddata3(struct xdr_stream *xdr,
-                *p++ = htonl(MAJOR(args->rdev));
+                              const struct nfs3_mknodargs *args)
-                *p++ = htonl(MINOR(args->rdev));
+{
+        encode_ftype3(xdr, args->type);
+        switch (args->type) {
+        case NF3CHR:
+        case NF3BLK:
+                encode_devicedata3(xdr, args);
+                break;
+        case NF3SOCK:
+        case NF3FIFO:
+                encode_sattr3(xdr, args->sattr);
+                break;
+        case NF3REG:
+        case NF3DIR:
+                break;
+        default:
+                BUG();
        }
+}
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
-        return 0;
+                                    struct xdr_stream *xdr,
+                                    const struct nfs3_mknodargs *args)
+{
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
+        encode_mknoddata3(xdr, args);
 }
 /*
- * Encode RENAME arguments
+ * 3.3.12  REMOVE3args
+ *
+ *      struct REMOVE3args {
+ *              diropargs3  object;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+                                     struct xdr_stream *xdr,
-{
+                                     const struct nfs_removeargs *args)
-        p = xdr_encode_fhandle(p, args->old_dir);
+{
-        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+        encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
-        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode LINK arguments
+ * 3.3.14  RENAME3args
+ *
+ *      struct RENAME3args {
+ *              diropargs3      from;
+ *              diropargs3      to;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
-nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        const struct qstr *old = args->old_name;
-        p = xdr_encode_fhandle(p, args->tofh);
+        const struct qstr *new = args->new_name;
-        p = xdr_encode_array(p, args->toname, args->tolen);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        encode_diropargs3(xdr, args->old_dir, old->name, old->len);
-        return 0;
+        encode_diropargs3(xdr, args->new_dir, new->name, new->len);
 }
 /*
- * Encode arguments to readdir call
+ * 3.3.15  LINK3args
+ *
+ *      struct LINK3args {
+ *              nfs_fh3         file;
+ *              diropargs3      link;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
-nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
+                                   struct xdr_stream *xdr,
+                                   const struct nfs3_linkargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_nfs_fh3(xdr, args->fromfh);
-        unsigned int replen;
+        encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
-        u32 count = args->count;
-        p = xdr_encode_fhandle(p, args->fh);
-        p = xdr_encode_hyper(p, args->cookie);
-        *p++ = args->verf[0];
-        *p++ = args->verf[1];
-        if (args->plus) {
-                /* readdirplus: need dircount + buffer size.
-                 * We just make sure we make dircount big enough */
-                *p++ = htonl(count >> 3);
-        }
-        *p++ = htonl(count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* Inline the page array */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-        return 0;
 }
 /*
- * Decode the result of a readdir call.
+ * 3.3.16  READDIR3args
- * We just check for syntactical correctness.
+ *
+ *      struct READDIR3args {
+ *              nfs_fh3         dir;
+ *              cookie3         cookie;
+ *              cookieverf3     cookieverf;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_readdir3args(struct xdr_stream *xdr,
-nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
+                                const struct nfs3_readdirargs *args)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        struct page **page;
-        size_t hdrlen;
-        u32 recvd, pglen;
-        int status;
-        status = ntohl(*p++);
-        /* Decode post_op_attrs */
-        p = xdr_decode_post_op_attr(p, res->dir_attr);
-        if (status)
-                return nfs_stat_to_errno(status);
-        /* Decode verifier cookie */
-        if (res->verf) {
-                res->verf[0] = *p++;
-                res->verf[1] = *p++;
-        } else {
-                p += 2;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        encode_nfs_fh3(xdr, args->fh);
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READDIR reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        pglen = rcvbuf->page_len;
+        p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
-        recvd = rcvbuf->len - hdrlen;
+        p = xdr_encode_cookie3(p, args->cookie);
-        if (pglen > recvd)
+        p = xdr_encode_cookieverf3(p, args->verf);
-                pglen = recvd;
+        *p = cpu_to_be32(args->count);
-        page = rcvbuf->pages;
+}
-        return pglen;
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs3_readdirargs *args)
+{
+        encode_readdir3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, 0,
+                                args->count, NFS3_readdirres_sz);
 }
-__be32 *
+/*
-nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+ * 3.3.17  READDIRPLUS3args
+ *
+ *      struct READDIRPLUS3args {
+ *              nfs_fh3         dir;
+ *              cookie3         cookie;
+ *              cookieverf3     cookieverf;
+ *              count3          dircount;
+ *              count3          maxcount;
+ *      };
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+                                    const struct nfs3_readdirargs *args)
 {
        __be32 *p;
-        struct nfs_entry old = *entry;
-        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
-                goto out_overflow;
-        if (!ntohl(*p++)) {
-                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
-                        goto out_overflow;
-                if (!ntohl(*p++))
-                        return ERR_PTR(-EAGAIN);
-                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
-        }
-        p = xdr_inline_decode(xdr, 12);
+        encode_nfs_fh3(xdr, args->fh);
-        if (unlikely(!p))
-                goto out_overflow;
-        p = xdr_decode_hyper(p, &entry->ino);
-        entry->len  = ntohl(*p++);
-        p = xdr_inline_decode(xdr, entry->len + 8);
+        p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
-        if (unlikely(!p))
+        p = xdr_encode_cookie3(p, args->cookie);
-                goto out_overflow;
+        p = xdr_encode_cookieverf3(p, args->verf);
-        entry->name = (const char *) p;
-        p += XDR_QUADLEN(entry->len);
-        entry->prev_cookie = entry->cookie;
-        p = xdr_decode_hyper(p, &entry->cookie);
-        entry->d_type = DT_UNKNOWN;
-        if (plus) {
-                entry->fattr->valid = 0;
-                p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
-                if (IS_ERR(p))
-                        goto out_overflow_exit;
-                entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
-                /* In fact, a post_op_fh3: */
-                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
-                        goto out_overflow;
-                if (*p++) {
-                        p = xdr_decode_fhandle_stream(xdr, entry->fh);
-                        if (IS_ERR(p))
-                                goto out_overflow_exit;
-                        /* Ugh -- server reply was truncated */
-                        if (p == NULL) {
-                                dprintk("NFS: FH truncated\n");
-                                *entry = old;
-                                return ERR_PTR(-EAGAIN);
-                        }
-                } else
-                        memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
-        }
-        p = xdr_inline_peek(xdr, 8);
+        /*
-        if (p != NULL)
+         * readdirplus: need dircount + buffer size.
-                entry->eof = !p[0] && p[1];
+         * We just make sure we make dircount big enough
-        else
+         */
-                entry->eof = 0;
+        *p++ = cpu_to_be32(args->count >> 3);
-        return p;
+        *p = cpu_to_be32(args->count);
+}
-out_overflow:
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
-        print_overflow_msg(__func__, xdr);
+                                          struct xdr_stream *xdr,
-out_overflow_exit:
+                                          const struct nfs3_readdirargs *args)
-        return ERR_PTR(-EAGAIN);
+{
+        encode_readdirplus3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, 0,
+                                args->count, NFS3_readdirres_sz);
 }
 /*
- * Encode COMMIT arguments
+ * 3.3.21  COMMIT3args
+ *
+ *      struct COMMIT3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_commit3args(struct xdr_stream *xdr,
-nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                               const struct nfs_writeargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        __be32 *p;
+        encode_nfs_fh3(xdr, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(args->count);
+        *p = cpu_to_be32(args->count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
-#ifdef CONFIG_NFS_V3_ACL
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
-/*
+                                     struct xdr_stream *xdr,
- * Encode GETACL arguments
+                                     const struct nfs_writeargs *args)
- */
-static int
-nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
-                    struct nfs3_getaclargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_commit3args(xdr, args);
-        unsigned int replen;
+}
-        p = xdr_encode_fhandle(p, args->fh);
+#ifdef CONFIG_NFS_V3_ACL
-        *p++ = htonl(args->mask);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        if (args->mask & (NFS_ACL | NFS_DFACL)) {
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
-                /* Inline the page array */
+                                     struct xdr_stream *xdr,
-                replen = (RPC_REPHDRSIZE + auth->au_rslack +
+                                     const struct nfs3_getaclargs *args)
-                          ACL3_getaclres_sz) << 2;
+{
-                xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0,
+        encode_nfs_fh3(xdr, args->fh);
-                                 NFSACL_MAXPAGES << PAGE_SHIFT);
+        encode_uint32(xdr, args->mask);
-        }
+        if (args->mask & (NFS_ACL | NFS_DFACL))
-        return 0;
+                prepare_reply_buffer(req, args->pages, 0,
+                                        NFSACL_MAXPAGES << PAGE_SHIFT,
+                                        ACL3_getaclres_sz);
 }
-/*
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
- * Encode SETACL arguments
+                                     struct xdr_stream *xdr,
- */
+                                     const struct nfs3_setaclargs *args)
-static int
-nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
-                   struct nfs3_setaclargs *args)
 {
-        struct xdr_buf *buf = &req->rq_snd_buf;
        unsigned int base;
-        int err;
+        int error;
-        p = xdr_encode_fhandle(p, NFS_FH(args->inode));
-        *p++ = htonl(args->mask);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        base = req->rq_slen;
+        encode_nfs_fh3(xdr, NFS_FH(args->inode));
+        encode_uint32(xdr, args->mask);
        if (args->npages != 0)
-                xdr_encode_pages(buf, args->pages, 0, args->len);
+                xdr_write_pages(xdr, args->pages, 0, args->len);
-        else
-                req->rq_slen = xdr_adjust_iovec(req->rq_svec,
-                                p + XDR_QUADLEN(args->len));
-        err = nfsacl_encode(buf, base, args->inode,
+        base = req->rq_slen;
+        error = nfsacl_encode(xdr->buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
                            args->acl_access : NULL, 1, 0);
-        if (err > 0)
+        BUG_ON(error < 0);
-                err = nfsacl_encode(buf, base + err, args->inode,
+        error = nfsacl_encode(xdr->buf, base + error, args->inode,
-                                    (args->mask & NFS_DFACL) ?
+                            (args->mask & NFS_DFACL) ?
-                                    args->acl_default : NULL, 1,
+                            args->acl_default : NULL, 1,
-                                    NFS_ACL_DEFAULT);
+                            NFS_ACL_DEFAULT);
-        return (err > 0) ? 0 : err;
+        BUG_ON(error < 0);
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 /*
- * NFS XDR decode functions
+ * NFSv3 XDR decode functions
+ *
+ * NFSv3 result types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
 */
 /*
- * Decode attrstat reply.
+ * 3.3.1  GETATTR3res
+ *
+ *      struct GETATTR3resok {
+ *              fattr3          obj_attributes;
+ *      };
+ *
+ *      union GETATTR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              GETATTR3resok  resok;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
-nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                                    struct xdr_stream *xdr,
+                                    struct nfs_fattr *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        xdr_decode_fattr(p, fattr);
+        if (unlikely(error))
-        return 0;
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_fattr3(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode status+wcc_data reply
+ * 3.3.2  SETATTR3res
- * SATTR, REMOVE, RMDIR
+ *
+ *      struct SETATTR3resok {
+ *              wcc_data  obj_wcc;
+ *      };
+ *
+ *      struct SETATTR3resfail {
+ *              wcc_data  obj_wcc;
+ *      };
+ *
+ *      union SETATTR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              SETATTR3resok   resok;
+ *      default:
+ *              SETATTR3resfail resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
-nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                                    struct xdr_stream *xdr,
+                                    struct nfs_fattr *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++)))
-                status = nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        xdr_decode_wcc_data(p, fattr);
+        if (unlikely(error))
-        return status;
+                goto out;
+        error = decode_wcc_data(xdr, result);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
-static int
+/*
-nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
+ * 3.3.3  LOOKUP3res
+ *
+ *      struct LOOKUP3resok {
+ *              nfs_fh3         object;
+ *              post_op_attr    obj_attributes;
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      struct LOOKUP3resfail {
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      union LOOKUP3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              LOOKUP3resok    resok;
+ *      default:
+ *              LOOKUP3resfail  resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_diropres *result)
 {
-        return nfs3_xdr_wccstat(req, p, res->dir_attr);
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_nfs_fh3(xdr, result->fh);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->dir_attr);
+out:
+        return error;
+out_default:
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode LOOKUP reply
+ * 3.3.4  ACCESS3res
+ *
+ *      struct ACCESS3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          access;
+ *      };
+ *
+ *      struct ACCESS3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union ACCESS3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              ACCESS3resok    resok;
+ *      default:
+ *              ACCESS3resfail  resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
-nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_accessres *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++))) {
-                status = nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        } else {
+        if (unlikely(error))
-                if (!(p = xdr_decode_fhandle(p, res->fh)))
+                goto out;
-                        return -errno_NFSERR_IO;
+        error = decode_post_op_attr(xdr, result->fattr);
-                p = xdr_decode_post_op_attr(p, res->fattr);
+        if (unlikely(error))
-        }
+                goto out;
-        xdr_decode_post_op_attr(p, res->dir_attr);
+        if (status != NFS3_OK)
-        return status;
+                goto out_default;
+        error = decode_uint32(xdr, &result->access);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode ACCESS reply
+ * 3.3.5  READLINK3res
+ *
+ *      struct READLINK3resok {
+ *              post_op_attr    symlink_attributes;
+ *              nfspath3        data;
+ *      };
+ *
+ *      struct READLINK3resfail {
+ *              post_op_attr    symlink_attributes;
+ *      };
+ *
+ *      union READLINK3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READLINK3resok  resok;
+ *      default:
+ *              READLINK3resfail resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
-nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
+                                     struct xdr_stream *xdr,
+                                     struct nfs_fattr *result)
 {
-        int     status = ntohl(*p++);
+        enum nfs_stat status;
+        int error;
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status)
+        error = decode_nfsstat3(xdr, &status);
-                return nfs_stat_to_errno(status);
+        if (unlikely(error))
-        res->access = ntohl(*p++);
+                goto out;
-        return 0;
+        error = decode_post_op_attr(xdr, result);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_nfspath3(xdr);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
-static int
+/*
-nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
+ * 3.3.6  READ3res
+ *
+ *      struct READ3resok {
+ *              post_op_attr    file_attributes;
+ *              count3          count;
+ *              bool            eof;
+ *              opaque          data<>;
+ *      };
+ *
+ *      struct READ3resfail {
+ *              post_op_attr    file_attributes;
+ *      };
+ *
+ *      union READ3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READ3resok      resok;
+ *      default:
+ *              READ3resfail    resfail;
+ *      };
+ */
+static int decode_read3resok(struct xdr_stream *xdr,
+                             struct nfs_readres *result)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        u32 eof, count, ocount, recvd;
-        unsigned int replen;
+        size_t hdrlen;
+        __be32 *p;
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_inline_decode(xdr, 4 + 4 + 4);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p++);
+        eof = be32_to_cpup(p++);
+        ocount = be32_to_cpup(p++);
+        if (unlikely(ocount != count))
+                goto out_mismatch;
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, count);
+        result->eof = eof;
+        result->count = count;
+        return count;
+out_mismatch:
+        dprintk("NFS: READ count doesn't match length of opaque: "
+                "count %u != ocount %u\n", count, ocount);
+        return -EIO;
+out_cheating:
+        dprintk("NFS: server cheating in read result: "
+                "count %u > recvd %u\n", count, recvd);
+        count = recvd;
+        eof = 0;
+        goto out;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
-        /* Inline the page array */
+static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2;
+                                 struct nfs_readres *result)
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
+{
-        return 0;
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_read3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READLINK reply
+ * 3.3.7  WRITE3res
+ *
+ *      enum stable_how {
+ *              UNSTABLE  = 0,
+ *              DATA_SYNC = 1,
+ *              FILE_SYNC = 2
+ *      };
+ *
+ *      struct WRITE3resok {
+ *              wcc_data        file_wcc;
+ *              count3          count;
+ *              stable_how      committed;
+ *              writeverf3      verf;
+ *      };
+ *
+ *      struct WRITE3resfail {
+ *              wcc_data        file_wcc;
+ *      };
+ *
+ *      union WRITE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              WRITE3resok     resok;
+ *      default:
+ *              WRITE3resfail   resfail;
+ *      };
 */
-static int
+static int decode_write3resok(struct xdr_stream *xdr,
-nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                              struct nfs_writeres *result)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        size_t hdrlen;
-        u32 len, recvd;
-        int     status;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        /* Convert length of symlink */
-        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len) {
-                dprintk("nfs: server returned giant symlink!\n");
-                return -ENAMETOOLONG;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
-        if (iov->iov_len < hdrlen) {
+        if (unlikely(p == NULL))
-                dprintk("NFS: READLINK reply header overflowed:"
+                goto out_overflow;
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+        result->count = be32_to_cpup(p++);
-                return -errno_NFSERR_IO;
+        result->verf->committed = be32_to_cpup(p++);
-        } else if (iov->iov_len != hdrlen) {
+        if (unlikely(result->verf->committed > NFS_FILE_SYNC))
-                dprintk("NFS: READLINK header is short. "
+                goto out_badvalue;
-                        "iovec will be shifted.\n");
+        memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
+        return result->count;
-        }
+out_badvalue:
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
-        if (recvd < len) {
+        return -EIO;
-                dprintk("NFS: server cheating in readlink reply: "
+out_overflow:
-                                "count %u > recvd %u\n", len, recvd);
+        print_overflow_msg(__func__, xdr);
-                return -EIO;
+        return -EIO;
-        }
+}
-        xdr_terminate_string(rcvbuf, len);
+static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
-        return 0;
+                                  struct nfs_writeres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_write3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READ reply
+ * 3.3.8  CREATE3res
+ *
+ *      struct CREATE3resok {
+ *              post_op_fh3     obj;
+ *              post_op_attr    obj_attributes;
+ *              wcc_data        dir_wcc;
+ *      };
+ *
+ *      struct CREATE3resfail {
+ *              wcc_data        dir_wcc;
+ *      };
+ *
+ *      union CREATE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              CREATE3resok    resok;
+ *      default:
+ *              CREATE3resfail  resfail;
+ *      };
 */
-static int
+static int decode_create3resok(struct xdr_stream *xdr,
-nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+                               struct nfs3_diropres *result)
 {
-        struct kvec *iov = req->rq_rcv_buf.head;
+        int error;
-        size_t hdrlen;
-        u32 count, ocount, recvd;
+        error = decode_post_op_fh3(xdr, result->fh);
-        int status;
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        /* The server isn't required to return a file handle.
+         * If it didn't, force the client to perform a LOOKUP
+         * to determine the correct file handle and attribute
+         * values for the new object. */
+        if (result->fh->size == 0)
+                result->fattr->valid = 0;
+        error = decode_wcc_data(xdr, result->dir_attr);
+out:
+        return error;
+}
-        status = ntohl(*p++);
+static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
-        p = xdr_decode_post_op_attr(p, res->fattr);
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_diropres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_create3resok(xdr, result);
+out:
+        return error;
+out_default:
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
+}
-        if (status != 0)
+/*
-                return nfs_stat_to_errno(status);
+ * 3.3.12  REMOVE3res
+ *
+ *      struct REMOVE3resok {
+ *              wcc_data    dir_wcc;
+ *      };
+ *
+ *      struct REMOVE3resfail {
+ *              wcc_data    dir_wcc;
+ *      };
+ *
+ *      union REMOVE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              REMOVE3resok   resok;
+ *      default:
+ *              REMOVE3resfail resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_removeres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        /* Decode reply count and EOF flag. NFSv3 is somewhat redundant
+/*
-         * in that it puts the count both in the res struct and in the
+ * 3.3.14  RENAME3res
-         * opaque data count. */
+ *
-        count    = ntohl(*p++);
+ *      struct RENAME3resok {
-        res->eof = ntohl(*p++);
+ *              wcc_data        fromdir_wcc;
-        ocount   = ntohl(*p++);
+ *              wcc_data        todir_wcc;
+ *      };
+ *
+ *      struct RENAME3resfail {
+ *              wcc_data        fromdir_wcc;
+ *              wcc_data        todir_wcc;
+ *      };
+ *
+ *      union RENAME3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              RENAME3resok   resok;
+ *      default:
+ *              RENAME3resfail resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_renameres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->old_fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->new_fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        if (ocount != count) {
+/*
-                dprintk("NFS: READ count doesn't match RPC opaque count.\n");
+ * 3.3.15  LINK3res
-                return -errno_NFSERR_IO;
+ *
-        }
+ *      struct LINK3resok {
+ *              post_op_attr    file_attributes;
+ *              wcc_data        linkdir_wcc;
+ *      };
+ *
+ *      struct LINK3resfail {
+ *              post_op_attr    file_attributes;
+ *              wcc_data        linkdir_wcc;
+ *      };
+ *
+ *      union LINK3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              LINK3resok      resok;
+ *      default:
+ *              LINK3resfail    resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs3_linkres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+/**
-        if (iov->iov_len < hdrlen) {
+ * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
-                dprintk("NFS: READ reply header overflowed:"
+ *                      the local page cache
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+ * @xdr: XDR stream where entry resides
-                return -errno_NFSERR_IO;
+ * @entry: buffer to fill in with entry data
-        } else if (iov->iov_len != hdrlen) {
+ * @plus: boolean indicating whether this should be a readdirplus entry
-                dprintk("NFS: READ header is short. iovec will be shifted.\n");
+ *
-                xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
+ * Returns zero if successful, otherwise a negative errno value is
-        }
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 3.3.16  entry3
+ *
+ *      struct entry3 {
+ *              fileid3         fileid;
+ *              filename3       name;
+ *              cookie3         cookie;
+ *              fhandle3        filehandle;
+ *              post_op_attr3   attributes;
+ *              entry3          *nextentry;
+ *      };
+ *
+ * 3.3.17  entryplus3
+ *      struct entryplus3 {
+ *              fileid3         fileid;
+ *              filename3       name;
+ *              cookie3         cookie;
+ *              post_op_attr    name_attributes;
+ *              post_op_fh3     name_handle;
+ *              entryplus3      *nextentry;
+ *      };
+ */
+int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
+{
+        struct nfs_entry old = *entry;
+        __be32 *p;
+        int error;
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        p = xdr_inline_decode(xdr, 4);
-        if (count > recvd) {
+        if (unlikely(p == NULL))
-                dprintk("NFS: server cheating in read reply: "
+                goto out_overflow;
-                        "count %u > recvd %u\n", count, recvd);
+        if (*p == xdr_zero) {
-                count = recvd;
+                p = xdr_inline_decode(xdr, 4);
-                res->eof = 0;
+                if (unlikely(p == NULL))
+                        goto out_overflow;
+                if (*p == xdr_zero)
+                        return -EAGAIN;
+                entry->eof = 1;
+                return -EBADCOOKIE;
        }
-        if (count < res->count)
+        error = decode_fileid3(xdr, &entry->ino);
-                res->count = count;
+        if (unlikely(error))
+                return error;
-        return count;
+        error = decode_inline_filename3(xdr, &entry->name, &entry->len);
-}
+        if (unlikely(error))
+                return error;
-/*
+        entry->prev_cookie = entry->cookie;
- * Decode WRITE response
+        error = decode_cookie3(xdr, &entry->cookie);
- */
+        if (unlikely(error))
-static int
+                return error;
-nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
-{
-        int     status;
-        status = ntohl(*p++);
+        entry->d_type = DT_UNKNOWN;
-        p = xdr_decode_wcc_data(p, res->fattr);
-        if (status != 0)
+        if (plus) {
-                return nfs_stat_to_errno(status);
+                entry->fattr->valid = 0;
+                error = decode_post_op_attr(xdr, entry->fattr);
+                if (unlikely(error))
+                        return error;
+                if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
+                        entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
-        res->count = ntohl(*p++);
+                /* In fact, a post_op_fh3: */
-        res->verf->committed = (enum nfs3_stable_how)ntohl(*p++);
+                p = xdr_inline_decode(xdr, 4);
-        res->verf->verifier[0] = *p++;
+                if (unlikely(p == NULL))
-        res->verf->verifier[1] = *p++;
+                        goto out_overflow;
+                if (*p != xdr_zero) {
+                        error = decode_nfs_fh3(xdr, entry->fh);
+                        if (unlikely(error)) {
+                                if (error == -E2BIG)
+                                        goto out_truncated;
+                                return error;
+                        }
+                } else
+                        zero_nfs_fh3(entry->fh);
+        }
-        return res->count;
+        return 0;
-}
-/*
+out_overflow:
- * Decode a CREATE response
+        print_overflow_msg(__func__, xdr);
- */
+        return -EAGAIN;
-static int
+out_truncated:
-nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
+        dprintk("NFS: directory entry contains invalid file handle\n");
-{
+        *entry = old;
-        int     status;
+        return -EAGAIN;
-        status = ntohl(*p++);
-        if (status == 0) {
-                if (*p++) {
-                        if (!(p = xdr_decode_fhandle(p, res->fh)))
-                                return -errno_NFSERR_IO;
-                        p = xdr_decode_post_op_attr(p, res->fattr);
-                } else {
-                        memset(res->fh, 0, sizeof(*res->fh));
-                        /* Do decode post_op_attr but set it to NULL */
-                        p = xdr_decode_post_op_attr(p, res->fattr);
-                        res->fattr->valid = 0;
-                }
-        } else {
-                status = nfs_stat_to_errno(status);
-        }
-        p = xdr_decode_wcc_data(p, res->dir_attr);
-        return status;
 }
 /*
- * Decode RENAME reply
+ * 3.3.16  READDIR3res
+ *
+ *      struct dirlist3 {
+ *              entry3          *entries;
+ *              bool            eof;
+ *      };
+ *
+ *      struct READDIR3resok {
+ *              post_op_attr    dir_attributes;
+ *              cookieverf3     cookieverf;
+ *              dirlist3        reply;
+ *      };
+ *
+ *      struct READDIR3resfail {
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      union READDIR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READDIR3resok   resok;
+ *      default:
+ *              READDIR3resfail resfail;
+ *      };
+ *
+ * Read the directory contents into the page cache, but otherwise
+ * don't touch them.  The actual decoding is done by nfs3_decode_entry()
+ * during subsequent nfs_readdir() calls.
 */
-static int
+static int decode_dirlist3(struct xdr_stream *xdr)
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
 {
-        int     status;
+        u32 recvd, pglen;
+        size_t hdrlen;
-        if ((status = ntohl(*p++)) != 0)
+        pglen = xdr->buf->page_len;
-                status = nfs_stat_to_errno(status);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        p = xdr_decode_wcc_data(p, res->old_fattr);
+        recvd = xdr->buf->len - hdrlen;
-        p = xdr_decode_wcc_data(p, res->new_fattr);
+        if (unlikely(pglen > recvd))
-        return status;
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, pglen);
+        return pglen;
+out_cheating:
+        dprintk("NFS: server cheating in readdir result: "
+                "pglen %u > recvd %u\n", pglen, recvd);
+        pglen = recvd;
+        goto out;
 }
-/*
+static int decode_readdir3resok(struct xdr_stream *xdr,
- * Decode LINK reply
+                                struct nfs3_readdirres *result)
- */
-static int
-nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
 {
-        int     status;
+        int error;
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        /* XXX: do we need to check if result->verf != NULL ? */
+        error = decode_cookieverf3(xdr, result->verf);
+        if (unlikely(error))
+                goto out;
+        error = decode_dirlist3(xdr);
+out:
+        return error;
+}
-        if ((status = ntohl(*p++)) != 0)
+static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
-                status = nfs_stat_to_errno(status);
+                                    struct xdr_stream *xdr,
-        p = xdr_decode_post_op_attr(p, res->fattr);
+                                    struct nfs3_readdirres *result)
-        p = xdr_decode_wcc_data(p, res->dir_attr);
+{
-        return status;
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_readdir3resok(xdr, result);
+out:
+        return error;
+out_default:
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode FSSTAT reply
+ * 3.3.18  FSSTAT3res
+ *
+ *      struct FSSTAT3resok {
+ *              post_op_attr    obj_attributes;
+ *              size3           tbytes;
+ *              size3           fbytes;
+ *              size3           abytes;
+ *              size3           tfiles;
+ *              size3           ffiles;
+ *              size3           afiles;
+ *              uint32          invarsec;
+ *      };
+ *
+ *      struct FSSTAT3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union FSSTAT3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              FSSTAT3resok    resok;
+ *      default:
+ *              FSSTAT3resfail  resfail;
+ *      };
 */
-static int
+static int decode_fsstat3resok(struct xdr_stream *xdr,
-nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
+                               struct nfs_fsstat *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        p = xdr_decode_hyper(p, &res->tbytes);
-        p = xdr_decode_hyper(p, &res->fbytes);
-        p = xdr_decode_hyper(p, &res->abytes);
-        p = xdr_decode_hyper(p, &res->tfiles);
-        p = xdr_decode_hyper(p, &res->ffiles);
-        p = xdr_decode_hyper(p, &res->afiles);
+        p = xdr_inline_decode(xdr, 8 * 6 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        p = xdr_decode_size3(p, &result->tbytes);
+        p = xdr_decode_size3(p, &result->fbytes);
+        p = xdr_decode_size3(p, &result->abytes);
+        p = xdr_decode_size3(p, &result->tfiles);
+        p = xdr_decode_size3(p, &result->ffiles);
+        xdr_decode_size3(p, &result->afiles);
        /* ignore invarsec */
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_fsstat *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_fsstat3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode FSINFO reply
+ * 3.3.19  FSINFO3res
+ *
+ *      struct FSINFO3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          rtmax;
+ *              uint32          rtpref;
+ *              uint32          rtmult;
+ *              uint32          wtmax;
+ *              uint32          wtpref;
+ *              uint32          wtmult;
+ *              uint32          dtpref;
+ *              size3           maxfilesize;
+ *              nfstime3        time_delta;
+ *              uint32          properties;
+ *      };
+ *
+ *      struct FSINFO3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union FSINFO3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              FSINFO3resok    resok;
+ *      default:
+ *              FSINFO3resfail  resfail;
+ *      };
 */
-static int
+static int decode_fsinfo3resok(struct xdr_stream *xdr,
-nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
+                               struct nfs_fsinfo *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        res->rtmax  = ntohl(*p++);
+        p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
-        res->rtpref = ntohl(*p++);
+        if (unlikely(p == NULL))
-        res->rtmult = ntohl(*p++);
+                goto out_overflow;
-        res->wtmax  = ntohl(*p++);
+        result->rtmax  = be32_to_cpup(p++);
-        res->wtpref = ntohl(*p++);
+        result->rtpref = be32_to_cpup(p++);
-        res->wtmult = ntohl(*p++);
+        result->rtmult = be32_to_cpup(p++);
-        res->dtpref = ntohl(*p++);
+        result->wtmax  = be32_to_cpup(p++);
-        p = xdr_decode_hyper(p, &res->maxfilesize);
+        result->wtpref = be32_to_cpup(p++);
-        p = xdr_decode_time3(p, &res->time_delta);
+        result->wtmult = be32_to_cpup(p++);
+        result->dtpref = be32_to_cpup(p++);
+        p = xdr_decode_size3(p, &result->maxfilesize);
+        xdr_decode_nfstime3(p, &result->time_delta);
        /* ignore properties */
-        res->lease_time = 0;
+        result->lease_time = 0;
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_fsinfo *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_fsinfo3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode PATHCONF reply
+ * 3.3.20  PATHCONF3res
+ *
+ *      struct PATHCONF3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          linkmax;
+ *              uint32          name_max;
+ *              bool            no_trunc;
+ *              bool            chown_restricted;
+ *              bool            case_insensitive;
+ *              bool            case_preserving;
+ *      };
+ *
+ *      struct PATHCONF3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union PATHCONF3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              PATHCONF3resok  resok;
+ *      default:
+ *              PATHCONF3resfail resfail;
+ *      };
 */
-static int
+static int decode_pathconf3resok(struct xdr_stream *xdr,
-nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
+                                 struct nfs_pathconf *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        res->max_link = ntohl(*p++);
-        res->max_namelen = ntohl(*p++);
+        p = xdr_inline_decode(xdr, 4 * 6);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        result->max_link = be32_to_cpup(p++);
+        result->max_namelen = be32_to_cpup(p);
        /* ignore remaining fields */
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_pathconf *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_pathconf3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode COMMIT reply
+ * 3.3.21  COMMIT3res
+ *
+ *      struct COMMIT3resok {
+ *              wcc_data        file_wcc;
+ *              writeverf3      verf;
+ *      };
+ *
+ *      struct COMMIT3resfail {
+ *              wcc_data        file_wcc;
+ *      };
+ *
+ *      union COMMIT3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              COMMIT3resok    resok;
+ *      default:
+ *              COMMIT3resfail  resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
-nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
+                                   struct xdr_stream *xdr,
+                                   struct nfs_writeres *result)
 {
-        int             status;
+        enum nfs_stat status;
+        int error;
-        status = ntohl(*p++);
-        p = xdr_decode_wcc_data(p, res->fattr);
+        error = decode_nfsstat3(xdr, &status);
-        if (status != 0)
+        if (unlikely(error))
-                return nfs_stat_to_errno(status);
+                goto out;
+        error = decode_wcc_data(xdr, result->fattr);
-        res->verf->verifier[0] = *p++;
+        if (unlikely(error))
-        res->verf->verifier[1] = *p++;
+                goto out;
-        return 0;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_writeverf3(xdr, result->verf->verifier);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 #ifdef CONFIG_NFS_V3_ACL
-/*
- * Decode GETACL reply
+static inline int decode_getacl3resok(struct xdr_stream *xdr,
- */
+                                      struct nfs3_getaclres *result)
-static int
-nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
-                   struct nfs3_getaclres *res)
 {
-        struct xdr_buf *buf = &req->rq_rcv_buf;
-        int status = ntohl(*p++);
        struct posix_acl **acl;
        unsigned int *aclcnt;
-        int err, base;
+        size_t hdrlen;
+        int error;
-        if (status != 0)
-                return nfs_stat_to_errno(status);
+        error = decode_post_op_attr(xdr, result->fattr);
-        p = xdr_decode_post_op_attr(p, res->fattr);
+        if (unlikely(error))
-        res->mask = ntohl(*p++);
+                goto out;
-        if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+        error = decode_uint32(xdr, &result->mask);
-                return -EINVAL;
+        if (unlikely(error))
-        base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base;
+                goto out;
+        error = -EINVAL;
-        acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL;
+        if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
-        aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL;
+                goto out;
-        err = nfsacl_decode(buf, base, aclcnt, acl);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL;
-        aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL;
+        acl = NULL;
-        if (err > 0)
+        if (result->mask & NFS_ACL)
-                err = nfsacl_decode(buf, base + err, aclcnt, acl);
+                acl = &result->acl_access;
-        return (err > 0) ? 0 : err;
+        aclcnt = NULL;
+        if (result->mask & NFS_ACLCNT)
+                aclcnt = &result->acl_access_count;
+        error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
+        if (unlikely(error <= 0))
+                goto out;
+        acl = NULL;
+        if (result->mask & NFS_DFACL)
+                acl = &result->acl_default;
+        aclcnt = NULL;
+        if (result->mask & NFS_DFACLCNT)
+                aclcnt = &result->acl_default_count;
+        error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
+        if (unlikely(error <= 0))
+                return error;
+        error = 0;
+out:
+        return error;
 }
-/*
+static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
- * Decode setacl reply.
+                                   struct xdr_stream *xdr,
- */
+                                   struct nfs3_getaclres *result)
-static int
-nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
-        int status = ntohl(*p++);
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_getacl3resok(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
+}
-        if (status)
+static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
-                return nfs_stat_to_errno(status);
+                                   struct xdr_stream *xdr,
-        xdr_decode_post_op_attr(p, fattr);
+                                   struct nfs_fattr *result)
-        return 0;
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_post_op_attr(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 #define PROC(proc, argtype, restype, timer)                             \
 [NFS3PROC_##proc] = {                                                   \
        .p_proc      = NFS3PROC_##proc,                                 \
-        .p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,                 \
+        .p_encode    = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args,      \
-        .p_decode    = (kxdrproc_t) nfs3_xdr_##restype,                 \
+        .p_decode    = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res,       \
-        .p_arglen    = NFS3_##argtype##_sz,                             \
+        .p_arglen    = NFS3_##argtype##args_sz,                         \
-        .p_replen    = NFS3_##restype##_sz,                             \
+        .p_replen    = NFS3_##restype##res_sz,                          \
        .p_timer     = timer,                                           \
        .p_statidx   = NFS3PROC_##proc,                                 \
        .p_name      = #proc,                                           \
        }
 struct rpc_procinfo     nfs3_procedures[] = {
-  PROC(GETATTR,         fhandle,        attrstat, 1),
+        PROC(GETATTR,           getattr,        getattr,        1),
-  PROC(SETATTR,         sattrargs,      wccstat, 0),
+        PROC(SETATTR,           setattr,        setattr,        0),
-  PROC(LOOKUP,          diropargs,      lookupres, 2),
+        PROC(LOOKUP,            lookup,         lookup,         2),
-  PROC(ACCESS,          accessargs,     accessres, 1),
+        PROC(ACCESS,            access,         access,         1),
-  PROC(READLINK,        readlinkargs,   readlinkres, 3),
+        PROC(READLINK,          readlink,       readlink,       3),
-  PROC(READ,            readargs,       readres, 3),
+        PROC(READ,              read,           read,           3),
-  PROC(WRITE,           writeargs,      writeres, 4),
+        PROC(WRITE,             write,          write,          4),
-  PROC(CREATE,          createargs,     createres, 0),
+        PROC(CREATE,            create,         create,         0),
-  PROC(MKDIR,           mkdirargs,      createres, 0),
+        PROC(MKDIR,             mkdir,          create,         0),
-  PROC(SYMLINK,         symlinkargs,    createres, 0),
+        PROC(SYMLINK,           symlink,        create,         0),
-  PROC(MKNOD,           mknodargs,      createres, 0),
+        PROC(MKNOD,             mknod,          create,         0),
-  PROC(REMOVE,          removeargs,     removeres, 0),
+        PROC(REMOVE,            remove,         remove,         0),
-  PROC(RMDIR,           diropargs,      wccstat, 0),
+        PROC(RMDIR,             lookup,         setattr,        0),
-  PROC(RENAME,          renameargs,     renameres, 0),
+        PROC(RENAME,            rename,         rename,         0),
-  PROC(LINK,            linkargs,       linkres, 0),
+        PROC(LINK,              link,           link,           0),
-  PROC(READDIR,         readdirargs,    readdirres, 3),
+        PROC(READDIR,           readdir,        readdir,        3),
-  PROC(READDIRPLUS,     readdirargs,    readdirres, 3),
+        PROC(READDIRPLUS,       readdirplus,    readdir,        3),
-  PROC(FSSTAT,          fhandle,        fsstatres, 0),
+        PROC(FSSTAT,            getattr,        fsstat,         0),
-  PROC(FSINFO,          fhandle,        fsinfores, 0),
+        PROC(FSINFO,            getattr,        fsinfo,         0),
-  PROC(PATHCONF,        fhandle,        pathconfres, 0),
+        PROC(PATHCONF,          getattr,        pathconf,       0),
-  PROC(COMMIT,          commitargs,     commitres, 5),
+        PROC(COMMIT,            commit,         commit,         5),
 };
 struct rpc_version              nfs_version3 = {
@@ -1185,8 +2468,8 @@ struct rpc_version		nfs_version3 = {
 static struct rpc_procinfo      nfs3_acl_procedures[] = {
        [ACLPROC3_GETACL] = {
                .p_proc = ACLPROC3_GETACL,
-                .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
+                .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
-                .p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
+                .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
                .p_arglen = ACL3_getaclargs_sz,
                .p_replen = ACL3_getaclres_sz,
                .p_timer = 1,
@@ -1194,8 +2477,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
        },
        [ACLPROC3_SETACL] = {
                .p_proc = ACLPROC3_SETACL,
-                .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
+                .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
-                .p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
+                .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
                .p_arglen = ACL3_setaclargs_sz,
                .p_replen = ACL3_setaclres_sz,
                .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fd..7a747407314 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
        NFS4CLNT_RECLAIM_REBOOT,
        NFS4CLNT_RECLAIM_NOGRACE,
        NFS4CLNT_DELEGRETURN,
+        NFS4CLNT_LAYOUTRECALL,
        NFS4CLNT_SESSION_RESET,
        NFS4CLNT_RECALL_SLOT,
 };
@@ -109,7 +110,7 @@ struct nfs_unique_id {
 struct nfs4_state_owner {
        struct nfs_unique_id so_owner_id;
        struct nfs_server    *so_server;
-        struct rb_node       so_client_node;
+        struct rb_node       so_server_node;
        struct rpc_cred      *so_cred;   /* Associated cred */
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
 extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
-/* inode.c */
-extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
-extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
-extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
 extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
+extern const struct xattr_handler *nfs4_xattr_handlers[];
 #if defined(CONFIG_NFS_V4_1)
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d65..23f930caf1e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 {
        struct nfs4_file_layout_dsaddr *dsaddr;
        int status = -EINVAL;
-        struct nfs_server *nfss = NFS_SERVER(lo->inode);
+        struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
        dprintk("--> %s\n", __func__);
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        /* find and reference the deviceid */
        dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
        if (dsaddr == NULL) {
-                dsaddr = get_device_info(lo->inode, id);
+                dsaddr = get_device_info(lo->plh_inode, id);
                if (dsaddr == NULL)
                        goto out;
        }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 static void
 filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 {
-        struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+        struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
        dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4435e5e1f90..9d992b0346e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,7 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/xattr.h>
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 }
 /*
- * Signal state manager thread if session is drained
+ * Signal state manager thread if session fore channel is drained
 */
-static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
+static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 {
        struct rpc_task *task;
@@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
        if (ses->fc_slot_table.highest_used_slotid != -1)
                return;
-        dprintk("%s COMPLETE: Session Drained\n", __func__);
+        dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
-        complete(&ses->complete);
+        complete(&ses->fc_slot_table.complete);
+}
+/*
+ * Signal state manager thread if session back channel is drained
+ */
+void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
+{
+        if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
+            ses->bc_slot_table.highest_used_slotid != -1)
+                return;
+        dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
+        complete(&ses->bc_slot_table.complete);
 }
 static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        spin_lock(&tbl->slot_tbl_lock);
        nfs4_free_slot(tbl, res->sr_slot);
-        nfs41_check_drain_session_complete(res->sr_session);
+        nfs4_check_drain_fc_complete(res->sr_session);
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slot = NULL;
 }
@@ -1826,6 +1839,8 @@ struct nfs4_closedata {
        struct nfs_closeres res;
        struct nfs_fattr fattr;
        unsigned long timestamp;
+        bool roc;
+        u32 roc_barrier;
 };
 static void nfs4_free_closedata(void *data)
@@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data)
        struct nfs4_closedata *calldata = data;
        struct nfs4_state_owner *sp = calldata->state->owner;
+        if (calldata->roc)
+                pnfs_roc_release(calldata->state->inode);
        nfs4_put_open_state(calldata->state);
        nfs_free_seqid(calldata->arg.seqid);
        nfs4_put_state_owner(sp);
@@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
         */
        switch (task->tk_status) {
                case 0:
+                        if (calldata->roc)
+                                pnfs_roc_set_barrier(state->inode,
+                                                     calldata->roc_barrier);
                        nfs_set_open_stateid(state, &calldata->res.stateid, 0);
                        renew_lease(server, calldata->timestamp);
                        nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
                return;
        }
-        if (calldata->arg.fmode == 0)
+        if (calldata->arg.fmode == 0) {
                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+                if (calldata->roc &&
+                    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
+                        rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
+                                     task, NULL);
+                        return;
+                }
+        }
        nfs_fattr_init(calldata->res.fattr);
        calldata->timestamp = jiffies;
@@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
 *
 * NOTE: Caller must be holding the sp->so_owner semaphore!
 */
-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
        struct nfs4_closedata *calldata;
@@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
+        calldata->roc = roc;
        path_get(path);
        calldata->path = *path;
-        msg.rpc_argp = &calldata->arg,
+        msg.rpc_argp = &calldata->arg;
-        msg.rpc_resp = &calldata->res,
+        msg.rpc_resp = &calldata->res;
        task_setup_data.callback_data = calldata;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
@@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
 out_free_calldata:
        kfree(calldata);
 out:
+        if (roc)
+                pnfs_roc_release(state->inode);
        nfs4_put_open_state(state);
        nfs4_put_state_owner(sp);
        return status;
@@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                path = &ctx->path;
                fmode = ctx->mode;
        }
+        sattr->ia_mode &= ~current_umask();
        state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
        d_drop(dentry);
        if (IS_ERR(state)) {
@@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 {
        struct nfs4_exception exception = { };
        int err;
+        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
                                _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 {
        struct nfs4_exception exception = { };
        int err;
+        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
                                _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        struct nfs4_setclientid setclientid = {
                .sc_verifier = &sc_verifier,
                .sc_prog = program,
+                .sc_cb_ident = clp->cl_cb_ident,
        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                if (signalled())
                        break;
                if (loop++ & 1)
-                        ssleep(clp->cl_lease_time + 1);
+                        ssleep(clp->cl_lease_time / HZ + 1);
                else
                        if (++clp->cl_id_uniquifier == 0)
                                break;
@@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        data->rpc_status = 0;
        task_setup_data.callback_data = data;
-        msg.rpc_argp = &data->args,
+        msg.rpc_argp = &data->args;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
@@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
                goto out;
        lsp = request->fl_u.nfs4_fl.owner;
        arg.lock_owner.id = lsp->ls_id.id;
+        arg.lock_owner.s_dev = server->s_dev;
        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        switch (status) {
                case 0:
@@ -3908,8 +3945,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                return ERR_PTR(-ENOMEM);
        }
-        msg.rpc_argp = &data->arg,
+        msg.rpc_argp = &data->arg;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task_setup_data.callback_data = data;
        return rpc_run_task(&task_setup_data);
 }
@@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.lock_stateid = &lsp->ls_stateid;
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
+        p->arg.lock_owner.s_dev = server->s_dev;
        p->res.lock_seqid = p->arg.lock_seqid;
        p->lsp = lsp;
        p->server = server;
@@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                        data->arg.reclaim = NFS_LOCK_RECLAIM;
                task_setup_data.callback_ops = &nfs4_recover_lock_ops;
        }
-        msg.rpc_argp = &data->arg,
+        msg.rpc_argp = &data->arg;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task_setup_data.callback_data = data;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
@@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
                return;
        args->lock_owner.clientid = server->nfs_client->cl_clientid;
        args->lock_owner.id = lsp->ls_id.id;
+        args->lock_owner.s_dev = server->s_dev;
        msg.rpc_argp = args;
        rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
 }
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
+static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
-                size_t buflen, int flags)
+                                   const void *buf, size_t buflen,
+                                   int flags, int type)
 {
-        struct inode *inode = dentry->d_inode;
+        if (strcmp(key, "") != 0)
+                return -EINVAL;
-        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
-                return -EOPNOTSUPP;
-        return nfs4_proc_set_acl(inode, buf, buflen);
+        return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
 }
-/* The getxattr man page suggests returning -ENODATA for unknown attributes,
+static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- * and that's what we'll do for e.g. user attributes that haven't been set.
+                                   void *buf, size_t buflen, int type)
- * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
- * attributes in kernel-managed attribute namespaces. */
-ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
-                size_t buflen)
 {
-        struct inode *inode = dentry->d_inode;
+        if (strcmp(key, "") != 0)
+                return -EINVAL;
-        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
-                return -EOPNOTSUPP;
-        return nfs4_proc_get_acl(inode, buf, buflen);
+        return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
 }
-ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
+static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
+                                       size_t list_len, const char *name,
+                                       size_t name_len, int type)
 {
-        size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1;
+        size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
        if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
                return 0;
-        if (buf && buflen < len)
-                return -ERANGE;
+        if (list && len <= list_len)
-        if (buf)
+                memcpy(list, XATTR_NAME_NFSV4_ACL, len);
-                memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
        return len;
 }
@@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 #ifdef CONFIG_NFS_V4_1
 /*
+ * Check the exchange flags returned by the server for invalid flags, having
+ * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
+ * DS flags set.
+ */
+static int nfs4_check_cl_exchange_flags(u32 flags)
+{
+        if (flags & ~EXCHGID4_FLAG_MASK_R)
+                goto out_inval;
+        if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
+            (flags & EXCHGID4_FLAG_USE_NON_PNFS))
+                goto out_inval;
+        if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
+                goto out_inval;
+        return NFS_OK;
+out_inval:
+        return -NFS4ERR_INVAL;
+}
+/*
 * nfs4_proc_exchange_id()
 *
 * Since the clientid has expired, all compounds using sessions
@@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        nfs4_verifier verifier;
        struct nfs41_exchange_id_args args = {
                .client = clp,
-                .flags = clp->cl_exchange_flags,
+                .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
        };
        struct nfs41_exchange_id_res res = {
                .client = clp,
@@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        dprintk("--> %s\n", __func__);
        BUG_ON(clp == NULL);
-        /* Remove server-only flags */
-        args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
        p = (u32 *)verifier.data;
        *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
        *p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
                        break;
        }
+        status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
        dprintk("<-- %s status= %d\n", __func__, status);
        return status;
 }
@@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
        if (!session)
                return NULL;
-        init_completion(&session->complete);
        tbl = &session->fc_slot_table;
        tbl->highest_used_slotid = -1;
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+        init_completion(&tbl->complete);
        tbl = &session->bc_slot_table;
        tbl->highest_used_slotid = -1;
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+        init_completion(&tbl->complete);
        session->session_state = 1<<NFS4_SESSION_INITING;
@@ -5280,13 +5330,23 @@ static void
 nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_layoutget *lgp = calldata;
-        struct inode *ino = lgp->args.inode;
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-        struct nfs_server *server = NFS_SERVER(ino);
        dprintk("--> %s\n", __func__);
+        /* Note there is a race here, where a CB_LAYOUTRECALL can come in
+         * right now covering the LAYOUTGET we are about to send.
+         * However, that is not so catastrophic, and there seems
+         * to be no way to prevent it completely.
+         */
        if (nfs4_setup_sequence(server, &lgp->args.seq_args,
                                &lgp->res.seq_res, 0, task))
                return;
+        if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+                                          NFS_I(lgp->args.inode)->layout,
+                                          lgp->args.ctx->state)) {
+                rpc_exit(task, NFS4_OK);
+                return;
+        }
        rpc_call_start(task);
 }
@@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
                        return;
                }
        }
-        lgp->status = task->tk_status;
        dprintk("<-- %s\n", __func__);
 }
@@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        dprintk("--> %s\n", __func__);
-        put_layout_hdr(lgp->args.inode);
        if (lgp->res.layout.buf != NULL)
                free_page((unsigned long) lgp->res.layout.buf);
        put_nfs_open_context(lgp->args.ctx);
@@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
-        if (status != 0)
+        if (status == 0)
-                goto out;
+                status = task->tk_status;
-        status = lgp->status;
+        if (status == 0)
-        if (status != 0)
+                status = pnfs_layout_process(lgp);
-                goto out;
-        status = pnfs_layout_process(lgp);
-out:
        rpc_put_task(task);
        dprintk("<-- %s status=%d\n", __func__, status);
        return status;
@@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
-        .getxattr       = nfs4_getxattr,
+        .getxattr       = generic_getxattr,
-        .setxattr       = nfs4_setxattr,
+        .setxattr       = generic_setxattr,
-        .listxattr      = nfs4_listxattr,
+        .listxattr      = generic_listxattr,
+        .removexattr    = generic_removexattr,
 };
 const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .open_context   = nfs4_atomic_open,
 };
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+        .prefix = XATTR_NAME_NFSV4_ACL,
+        .list   = nfs4_xattr_list_nfs4_acl,
+        .get    = nfs4_xattr_get_nfs4_acl,
+        .set    = nfs4_xattr_set_nfs4_acl,
+};
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+        &nfs4_xattr_nfs4_acl_handler,
+        NULL
+};
 /*
 * Local variables:
 *  c-basic-offset: 8
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af1..402143d75fc 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
        ops = clp->cl_mvops->state_renewal_ops;
        dprintk("%s: start\n", __func__);
-        /* Are there any active superblocks? */
-        if (list_empty(&clp->cl_superblocks))
+        rcu_read_lock();
+        if (list_empty(&clp->cl_superblocks)) {
+                rcu_read_unlock();
                goto out;
+        }
+        rcu_read_unlock();
        spin_lock(&clp->cl_lock);
        lease = clp->cl_lease_time;
        last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
                cred = ops->get_state_renewal_cred_locked(clp);
                spin_unlock(&clp->cl_lock);
                if (cred == NULL) {
-                        if (list_empty(&clp->cl_delegations)) {
+                        if (!nfs_delegations_present(clp)) {
                                set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
                                goto out;
                        }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a312673..2336d532cf6 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
                put_rpccred(cred);
 }
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
 {
+        struct rpc_cred *cred = NULL;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
-        struct rpc_cred *cred = NULL;
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        for (pos = rb_first(&server->state_owners);
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+             pos != NULL;
+             pos = rb_next(pos)) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                if (list_empty(&sp->so_states))
                        continue;
                cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
        return cred;
 }
+/**
+ * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ * Caller must hold clp->cl_lock.
+ */
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+{
+        struct rpc_cred *cred = NULL;
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+                cred = nfs4_get_renew_cred_server_locked(server);
+                if (cred != NULL)
+                        break;
+        }
+        rcu_read_unlock();
+        return cred;
+}
 #if defined(CONFIG_NFS_V4_1)
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
        return status;
 }
+/*
+ * Back channel returns NFS4ERR_DELAY for new requests when
+ * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
+ * is ended.
+ */
 static void nfs4_end_drain_session(struct nfs_client *clp)
 {
        struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
        }
 }
-static int nfs4_begin_drain_session(struct nfs_client *clp)
+static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
 {
-        struct nfs4_session *ses = clp->cl_session;
-        struct nfs4_slot_table *tbl = &ses->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
-        set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
        if (tbl->highest_used_slotid != -1) {
-                INIT_COMPLETION(ses->complete);
+                INIT_COMPLETION(tbl->complete);
                spin_unlock(&tbl->slot_tbl_lock);
-                return wait_for_completion_interruptible(&ses->complete);
+                return wait_for_completion_interruptible(&tbl->complete);
        }
        spin_unlock(&tbl->slot_tbl_lock);
        return 0;
 }
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+        struct nfs4_session *ses = clp->cl_session;
+        int ret = 0;
+        set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
+        /* back channel */
+        ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+        if (ret)
+                return ret;
+        /* fore channel */
+        return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+}
 int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
        int status;
@@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
        status = nfs4_proc_create_session(clp);
        if (status != 0)
                goto out;
+        status = nfs4_set_callback_sessionid(clp);
+        if (status != 0) {
+                printk(KERN_WARNING "Sessionid not set. No callback service\n");
+                nfs_callback_down(1);
+                status = 0;
+        }
        nfs41_setup_state_renewal(clp);
        nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
@@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 #endif /* CONFIG_NFS_V4_1 */
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_setclientid_cred_server(struct nfs_server *server)
 {
+        struct nfs_client *clp = server->nfs_client;
+        struct rpc_cred *cred = NULL;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
+        spin_lock(&clp->cl_lock);
+        pos = rb_first(&server->state_owners);
+        if (pos != NULL) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+                cred = get_rpccred(sp->so_cred);
+        }
+        spin_unlock(&clp->cl_lock);
+        return cred;
+}
+/**
+ * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ */
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+{
+        struct nfs_server *server;
        struct rpc_cred *cred;
        spin_lock(&clp->cl_lock);
        cred = nfs4_get_machine_cred_locked(clp);
+        spin_unlock(&clp->cl_lock);
        if (cred != NULL)
                goto out;
-        pos = rb_first(&clp->cl_state_owners);
-        if (pos != NULL) {
+        rcu_read_lock();
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                cred = get_rpccred(sp->so_cred);
+                cred = nfs4_get_setclientid_cred_server(server);
+                if (cred != NULL)
+                        break;
        }
+        rcu_read_unlock();
 out:
-        spin_unlock(&clp->cl_lock);
        return cred;
 }
-static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
+static void nfs_alloc_unique_id_locked(struct rb_root *root,
-                __u64 minval, int maxbits)
+                                       struct nfs_unique_id *new,
+                                       __u64 minval, int maxbits)
 {
        struct rb_node **p, *parent;
        struct nfs_unique_id *pos;
@@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
 }
 static struct nfs4_state_owner *
-nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
-        struct nfs_client *clp = server->nfs_client;
+        struct rb_node **p = &server->state_owners.rb_node,
-        struct rb_node **p = &clp->cl_state_owners.rb_node,
                       *parent = NULL;
        struct nfs4_state_owner *sp, *res = NULL;
        while (*p != NULL) {
                parent = *p;
-                sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
                if (server < sp->so_server) {
                        p = &parent->rb_left;
@@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 }
 static struct nfs4_state_owner *
-nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 {
-        struct rb_node **p = &clp->cl_state_owners.rb_node,
+        struct nfs_server *server = new->so_server;
+        struct rb_node **p = &server->state_owners.rb_node,
                       *parent = NULL;
        struct nfs4_state_owner *sp;
        while (*p != NULL) {
                parent = *p;
-                sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
-                if (new->so_server < sp->so_server) {
-                        p = &parent->rb_left;
-                        continue;
-                }
-                if (new->so_server > sp->so_server) {
-                        p = &parent->rb_right;
-                        continue;
-                }
                if (new->so_cred < sp->so_cred)
                        p = &parent->rb_left;
                else if (new->so_cred > sp->so_cred)
@@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
                        return sp;
                }
        }
-        nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64);
+        nfs_alloc_unique_id_locked(&server->openowner_id,
-        rb_link_node(&new->so_client_node, parent, p);
+                                        &new->so_owner_id, 1, 64);
-        rb_insert_color(&new->so_client_node, &clp->cl_state_owners);
+        rb_link_node(&new->so_server_node, parent, p);
+        rb_insert_color(&new->so_server_node, &server->state_owners);
        return new;
 }
 static void
-nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 {
-        if (!RB_EMPTY_NODE(&sp->so_client_node))
+        struct nfs_server *server = sp->so_server;
-                rb_erase(&sp->so_client_node, &clp->cl_state_owners);
-        nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id);
+        if (!RB_EMPTY_NODE(&sp->so_server_node))
+                rb_erase(&sp->so_server_node, &server->state_owners);
+        nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
 }
 /*
@@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void)
 static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-        if (!RB_EMPTY_NODE(&sp->so_client_node)) {
+        if (!RB_EMPTY_NODE(&sp->so_server_node)) {
-                struct nfs_client *clp = sp->so_server->nfs_client;
+                struct nfs_server *server = sp->so_server;
+                struct nfs_client *clp = server->nfs_client;
                spin_lock(&clp->cl_lock);
-                rb_erase(&sp->so_client_node, &clp->cl_state_owners);
+                rb_erase(&sp->so_server_node, &server->state_owners);
-                RB_CLEAR_NODE(&sp->so_client_node);
+                RB_CLEAR_NODE(&sp->so_server_node);
                spin_unlock(&clp->cl_lock);
        }
 }
-struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+                                              struct rpc_cred *cred)
 {
        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp, *new;
        spin_lock(&clp->cl_lock);
-        sp = nfs4_find_state_owner(server, cred);
+        sp = nfs4_find_state_owner_locked(server, cred);
        spin_unlock(&clp->cl_lock);
        if (sp != NULL)
                return sp;
@@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
        new->so_server = server;
        new->so_cred = cred;
        spin_lock(&clp->cl_lock);
-        sp = nfs4_insert_state_owner(clp, new);
+        sp = nfs4_insert_state_owner_locked(new);
        spin_unlock(&clp->cl_lock);
        if (sp == new)
                get_rpccred(cred);
@@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
        return sp;
 }
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
        struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
        if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
                return;
-        nfs4_remove_state_owner(clp, sp);
+        nfs4_remove_state_owner_locked(sp);
        spin_unlock(&clp->cl_lock);
        rpc_destroy_wait_queue(&sp->so_sequence.wait);
        put_rpccred(cred);
@@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
        if (!call_close) {
                nfs4_put_open_state(state);
                nfs4_put_state_owner(owner);
-        } else
+        } else {
-                nfs4_do_close(path, state, gfp_mask, wait);
+                bool roc = pnfs_roc(state->inode);
+                nfs4_do_close(path, state, gfp_mask, wait, roc);
+        }
 }
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
 static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
        struct nfs4_lock_state *lsp;
-        struct nfs_client *clp = state->owner->so_server->nfs_client;
+        struct nfs_server *server = state->owner->so_server;
+        struct nfs_client *clp = server->nfs_client;
        lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
        if (lsp == NULL)
@@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
                return NULL;
        }
        spin_lock(&clp->cl_lock);
-        nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
+        nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
        spin_unlock(&clp->cl_lock);
        INIT_LIST_HEAD(&lsp->ls_locks);
        return lsp;
@@ -665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-        struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
+        struct nfs_server *server = lsp->ls_state->owner->so_server;
+        struct nfs_client *clp = server->nfs_client;
        spin_lock(&clp->cl_lock);
-        nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
+        nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
        spin_unlock(&clp->cl_lock);
        rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
        kfree(lsp);
@@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
        }
 }
-static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+static void nfs4_reset_seqids(struct nfs_server *server,
+        int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
 {
+        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
-        /* Reset all sequence ids to zero */
+        spin_lock(&clp->cl_lock);
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        for (pos = rb_first(&server->state_owners);
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+             pos != NULL;
+             pos = rb_next(pos)) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                sp->so_seqid.flags = 0;
                spin_lock(&sp->so_lock);
                list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
                }
                spin_unlock(&sp->so_lock);
        }
+        spin_unlock(&clp->cl_lock);
+}
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+        int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs4_reset_seqids(server, mark_reclaim);
+        rcu_read_unlock();
 }
 static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
                (void)ops->reclaim_complete(clp);
 }
-static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
 {
+        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
-        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+        spin_lock(&clp->cl_lock);
-                return 0;
+        for (pos = rb_first(&server->state_owners);
+             pos != NULL;
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+             pos = rb_next(pos)) {
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                spin_lock(&sp->so_lock);
                list_for_each_entry(state, &sp->so_states, open_states) {
-                        if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+                        if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+                                                &state->flags))
                                continue;
                        nfs4_state_mark_reclaim_nograce(clp, state);
                }
                spin_unlock(&sp->so_lock);
        }
+        spin_unlock(&clp->cl_lock);
+}
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+        struct nfs_server *server;
+        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+                return 0;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs4_clear_reclaim_server(server);
+        rcu_read_unlock();
        nfs_delegation_reap_unclaimed(clp);
        return 1;
@@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
 {
+        struct nfs4_state_owner *sp;
+        struct nfs_server *server;
        struct rb_node *pos;
        int status = 0;
 restart:
-        spin_lock(&clp->cl_lock);
+        rcu_read_lock();
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+                spin_lock(&clp->cl_lock);
-                if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
+                for (pos = rb_first(&server->state_owners);
-                        continue;
+                     pos != NULL;
-                atomic_inc(&sp->so_count);
+                     pos = rb_next(pos)) {
-                spin_unlock(&clp->cl_lock);
+                        sp = rb_entry(pos,
-                status = nfs4_reclaim_open_state(sp, ops);
+                                struct nfs4_state_owner, so_server_node);
-                if (status < 0) {
+                        if (!test_and_clear_bit(ops->owner_flag_bit,
-                        set_bit(ops->owner_flag_bit, &sp->so_flags);
+                                                        &sp->so_flags))
+                                continue;
+                        atomic_inc(&sp->so_count);
+                        spin_unlock(&clp->cl_lock);
+                        rcu_read_unlock();
+                        status = nfs4_reclaim_open_state(sp, ops);
+                        if (status < 0) {
+                                set_bit(ops->owner_flag_bit, &sp->so_flags);
+                                nfs4_put_state_owner(sp);
+                                return nfs4_recovery_handle_error(clp, status);
+                        }
                        nfs4_put_state_owner(sp);
-                        return nfs4_recovery_handle_error(clp, status);
+                        goto restart;
                }
-                nfs4_put_state_owner(sp);
+                spin_unlock(&clp->cl_lock);
-                goto restart;
        }
-        spin_unlock(&clp->cl_lock);
+        rcu_read_unlock();
        return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f1826b012e..2ab8e5cb8f5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
 /* lock,open owner id:
 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
 */
-#define open_owner_id_maxsz     (1 + 4)
+#define open_owner_id_maxsz     (1 + 1 + 4)
-#define lock_owner_id_maxsz     (1 + 4)
+#define lock_owner_id_maxsz     (1 + 1 + 4)
 #define decode_lockowner_maxsz  (1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define compound_encode_hdr_maxsz       (3 + (NFS4_MAXTAGLEN >> 2))
 #define compound_decode_hdr_maxsz       (3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
 {
        __be32 *p;
-        p = reserve_space(xdr, 28);
+        p = reserve_space(xdr, 32);
        p = xdr_encode_hyper(p, lowner->clientid);
-        *p++ = cpu_to_be32(16);
+        *p++ = cpu_to_be32(20);
        p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+        *p++ = cpu_to_be32(lowner->s_dev);
        xdr_encode_hyper(p, lowner->id);
 }
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
        *p++ = cpu_to_be32(OP_OPEN);
        *p = cpu_to_be32(arg->seqid->sequence->counter);
        encode_share_access(xdr, arg->fmode);
-        p = reserve_space(xdr, 28);
+        p = reserve_space(xdr, 32);
        p = xdr_encode_hyper(p, arg->clientid);
-        *p++ = cpu_to_be32(16);
+        *p++ = cpu_to_be32(20);
        p = xdr_encode_opaque_fixed(p, "open id:", 8);
+        *p++ = cpu_to_be32(arg->server->s_dev);
        xdr_encode_hyper(p, arg->id);
 }
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        hdr->replen += decode_restorefh_maxsz;
 }
-static int
+static void
 encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
 {
        __be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
        p = reserve_space(xdr, 2*4);
        *p++ = cpu_to_be32(1);
        *p = cpu_to_be32(FATTR4_WORD0_ACL);
-        if (arg->acl_len % 4)
+        BUG_ON(arg->acl_len % 4);
-                return -EINVAL;
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(arg->acl_len);
        xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
        hdr->nops++;
        hdr->replen += decode_setacl_maxsz;
-        return 0;
 }
 static void
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
                      const struct nfs4_layoutget_args *args,
                      struct compound_hdr *hdr)
 {
-        nfs4_stateid stateid;
        __be32 *p;
        p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
        p = xdr_encode_hyper(p, args->range.offset);
        p = xdr_encode_hyper(p, args->range.length);
        p = xdr_encode_hyper(p, args->minlength);
-        pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+        p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
-                                args->ctx->state);
-        p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
        *p = cpu_to_be32(args->maxcount);
        dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 /*
 * Encode an ACCESS request
 */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_accessargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_access(xdr, args->access, &hdr);
-        encode_access(&xdr, args->access, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LOOKUP request
 */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_lookup_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_lookup(xdr, args->name, &hdr);
-        encode_lookup(&xdr, args->name, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LOOKUP_ROOT request
 */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_lookup_root_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode REMOVE request
 */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs_removeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_remove(xdr, &args->name, &hdr);
-        encode_remove(&xdr, &args->name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode RENAME request
 */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs_renameargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->old_dir, &hdr);
-        encode_putfh(&xdr, args->old_dir, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_putfh(xdr, args->new_dir, &hdr);
-        encode_putfh(&xdr, args->new_dir, &hdr);
+        encode_rename(xdr, args->old_name, args->new_name, &hdr);
-        encode_rename(&xdr, args->old_name, args->new_name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LINK request
 */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+                             const struct nfs4_link_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_link(xdr, args->name, &hdr);
-        encode_link(&xdr, args->name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode CREATE request
 */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_create_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_create(xdr, args, &hdr);
-        encode_create(&xdr, args, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode SYMLINK request
 */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_create_arg *args)
 {
-        return nfs4_xdr_enc_create(req, p, args);
+        nfs4_xdr_enc_create(req, xdr, args);
 }
 /*
 * Encode GETATTR request
 */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_getattr_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a CLOSE request
 */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_closeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_close(xdr, args, &hdr);
-        encode_close(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN request
 */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_openargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_open(xdr, args, &hdr);
-        encode_open(&xdr, args, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN_CONFIRM request
 */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs_open_confirmargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open_confirm(xdr, args, &hdr);
-        encode_open_confirm(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN request with no attributes.
 */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_openargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open(xdr, args, &hdr);
-        encode_open(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN_DOWNGRADE request
 */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+                                        struct xdr_stream *xdr,
+                                        struct nfs_closeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open_downgrade(xdr, args, &hdr);
-        encode_open_downgrade(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCK request
 */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_lock_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_lock(xdr, args, &hdr);
-        encode_lock(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCKT request
 */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_lockt_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_lockt(xdr, args, &hdr);
-        encode_lockt(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCKU request
 */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_locku_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_locku(xdr, args, &hdr);
-        encode_locku(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+                                           struct xdr_stream *xdr,
+                                        struct nfs_release_lockowner_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_release_lockowner(xdr, &args->lock_owner, &hdr);
-        encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READLINK request
 */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                  const struct nfs4_readlink *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_readlink(xdr, args, req, &hdr);
-        encode_readlink(&xdr, args, req, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
                        args->pgbase, args->pglen);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READDIR request
 */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_readdir_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_readdir(xdr, args, req, &hdr);
-        encode_readdir(&xdr, args, req, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
                         args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
                        __func__, hdr.replen << 2, args->pages,
                        args->pgbase, args->count);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READ request
 */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_readargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_read(xdr, args, &hdr);
-        encode_read(&xdr, args, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
                         args->pages, args->pgbase, args->count);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an SETATTR request
 */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs_setattrargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_setattr(xdr, args, args->server, &hdr);
-        encode_setattr(&xdr, args, args->server, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a GETACL request
 */
-static int
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
+                                struct nfs_getaclargs *args)
-                struct nfs_getaclargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        uint32_t replen;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
        replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-        encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+        encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
                args->acl_pages, args->acl_pgbase, args->acl_len);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a WRITE request
 */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_writeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_write(xdr, args, &hdr);
-        encode_write(&xdr, args, &hdr);
        req->rq_snd_buf.flags |= XDRBUF_WRITE;
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 *  a COMMIT request
 */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs_writeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_commit(xdr, args, &hdr);
-        encode_commit(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * FSINFO request
 */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs4_fsinfo_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_fsinfo(xdr, args->bitmask, &hdr);
-        encode_fsinfo(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a PATHCONF request
 */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                  const struct nfs4_pathconf_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
-        encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
                           &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a STATFS request
 */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_statfs_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
-        encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
                           args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * GETATTR_BITMAP request
 */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
-                                    struct nfs4_server_caps_arg *args)
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_server_caps_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fhandle, &hdr);
-        encode_putfh(&xdr, args->fhandle, &hdr);
+        encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-        encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
                           FATTR4_WORD0_LINK_SUPPORT|
                           FATTR4_WORD0_SYMLINK_SUPPORT|
                           FATTR4_WORD0_ACLSUPPORT, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a RENEW request
 */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_client *clp)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_renew(xdr, clp, &hdr);
-        encode_renew(&xdr, clp, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SETCLIENTID request
 */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_setclientid *sc)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_setclientid(xdr, sc, &hdr);
-        encode_setclientid(&xdr, sc, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SETCLIENTID_CONFIRM request
 */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+                                             struct xdr_stream *xdr,
+                                             struct nfs4_setclientid_res *arg)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_setclientid_confirm(xdr, arg, &hdr);
-        encode_setclientid_confirm(&xdr, arg, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_fsinfo(xdr, lease_bitmap, &hdr);
-        encode_fsinfo(&xdr, lease_bitmap, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * DELEGRETURN request
 */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_delegreturnargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fhandle, &hdr);
-        encode_putfh(&xdr, args->fhandle, &hdr);
+        encode_delegreturn(xdr, args->stateid, &hdr);
-        encode_delegreturn(&xdr, args->stateid, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode FS_LOCATIONS request
 */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs4_fs_locations_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        uint32_t replen;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_lookup(xdr, args->name, &hdr);
-        encode_lookup(&xdr, args->name, &hdr);
        replen = hdr.replen;    /* get the attribute into args->page */
-        encode_fs_locations(&xdr, args->bitmask, &hdr);
+        encode_fs_locations(xdr, args->bitmask, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
                        0, PAGE_SIZE);
        encode_nops(&hdr);
-        return 0;
 }
 #if defined(CONFIG_NFS_V4_1)
 /*
 * EXCHANGE_ID request
 */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
-                                    struct nfs41_exchange_id_args *args)
+                                     struct xdr_stream *xdr,
+                                     struct nfs41_exchange_id_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = args->client->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_exchange_id(xdr, args, &hdr);
-        encode_exchange_id(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a CREATE_SESSION request
 */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
-                                       struct nfs41_create_session_args *args)
+                                        struct xdr_stream *xdr,
+                                        struct nfs41_create_session_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = args->client->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_create_session(xdr, args, &hdr);
-        encode_create_session(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a DESTROY_SESSION request
 */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
-                                        struct nfs4_session *session)
+                                         struct xdr_stream *xdr,
+                                         struct nfs4_session *session)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = session->clp->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_destroy_session(xdr, session, &hdr);
-        encode_destroy_session(&xdr, session, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SEQUENCE request
 */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
-                                 struct nfs4_sequence_args *args)
+                                  struct nfs4_sequence_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, args, &hdr);
-        encode_sequence(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a GET_LEASE_TIME request
 */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
-                                       struct nfs4_get_lease_time_args *args)
+                                        struct xdr_stream *xdr,
+                                        struct nfs4_get_lease_time_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
        };
        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->la_seq_args, &hdr);
-        encode_sequence(&xdr, &args->la_seq_args, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_fsinfo(xdr, lease_bitmap, &hdr);
-        encode_fsinfo(&xdr, lease_bitmap, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a RECLAIM_COMPLETE request
 */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
-                                     struct nfs41_reclaim_complete_args *args)
+                                          struct xdr_stream *xdr,
+                                struct nfs41_reclaim_complete_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args)
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_reclaim_complete(xdr, args, &hdr);
-        encode_reclaim_complete(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode GETDEVICEINFO request
 */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
-                                      struct nfs4_getdeviceinfo_args *args)
+                                       struct xdr_stream *xdr,
+                                       struct nfs4_getdeviceinfo_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_getdeviceinfo(xdr, args, &hdr);
-        encode_getdeviceinfo(&xdr, args, &hdr);
        /* set up reply kvec. Subtract notification bitmap max size (2)
         * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
                         args->pdev->pglen);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 *  Encode LAYOUTGET request
 */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
-                                  struct nfs4_layoutget_args *args)
+                                   struct xdr_stream *xdr,
+                                   struct nfs4_layoutget_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, NFS_FH(args->inode), &hdr);
-        encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+        encode_layoutget(xdr, args, &hdr);
-        encode_layoutget(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
                goto out_overflow;
        eof = be32_to_cpup(p++);
        count = be32_to_cpup(p);
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
                goto out_overflow;
        len = be32_to_cpup(p);
        if (len) {
-                int i;
+                uint32_t i;
                p = xdr_inline_decode(xdr, 4 * len);
                if (unlikely(!p))
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
 * Decode OPEN_DOWNGRADE response
 */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
+                                       struct nfs_closeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open_downgrade(&xdr, res);
+        status = decode_open_downgrade(xdr, res);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5118,26 +5040,25 @@ out:
 /*
 * Decode ACCESS response
 */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_accessres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status != 0)
                goto out;
-        status = decode_access(&xdr, res);
+        status = decode_access(xdr, res);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5146,26 +5067,28 @@ out:
 /*
 * Decode LOOKUP response
 */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_lookup_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_lookup(&xdr)) != 0)
+        status = decode_lookup(xdr);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) != 0)
+        status = decode_getfh(xdr, res->fh);
+        if (status)
                goto out;
-        status = decode_getfattr(&xdr, res->fattr, res->server
+        status = decode_getfattr(xdr, res->fattr, res->server
                        ,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5174,23 +5097,25 @@ out:
 /*
 * Decode LOOKUP_ROOT response
 */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_lookup_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putrootfh(&xdr)) != 0)
+        status = decode_putrootfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) == 0)
+        status = decode_getfh(xdr, res->fh);
-                status = decode_getfattr(&xdr, res->fattr, res->server,
+        if (status == 0)
+                status = decode_getfattr(xdr, res->fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5199,24 +5124,25 @@ out:
 /*
 * Decode REMOVE response
 */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_removeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+        status = decode_remove(xdr, &res->cinfo);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->dir_attr, res->server,
+        decode_getfattr(xdr, res->dir_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5225,34 +5151,38 @@ out:
 /*
 * Decode RENAME response
 */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_renameres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+        status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+        if (status)
                goto out;
        /* Current FH is target directory */
-        if (decode_getfattr(&xdr, res->new_fattr, res->server,
+        if (decode_getfattr(xdr, res->new_fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->old_fattr, res->server,
+        decode_getfattr(xdr, res->old_fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5261,37 +5191,41 @@ out:
 /*
 * Decode LINK response
 */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs4_link_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+        status = decode_link(xdr, &res->cinfo);
+        if (status)
                goto out;
        /*
         * Note order: OP_LINK leaves the directory as the current
         *             filehandle.
         */
-        if (decode_getfattr(&xdr, res->dir_attr, res->server,
+        if (decode_getfattr(xdr, res->dir_attr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5300,33 +5234,37 @@ out:
 /*
 * Decode CREATE response
 */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_create_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+        status = decode_create(xdr, &res->dir_cinfo);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) != 0)
+        status = decode_getfh(xdr, res->fh);
+        if (status)
                goto out;
-        if (decode_getfattr(&xdr, res->fattr, res->server,
+        if (decode_getfattr(xdr, res->fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->dir_fattr, res->server,
+        decode_getfattr(xdr, res->dir_fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5335,31 +5273,31 @@ out:
 /*
 * Decode SYMLINK response
 */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_create_res *res)
 {
-        return nfs4_xdr_dec_create(rqstp, p, res);
+        return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 /*
 * Decode GETATTR response
 */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_getattr_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_getfattr(&xdr, res->fattr, res->server,
+        status = decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5368,46 +5306,40 @@ out:
 /*
 * Encode an SETACL request
 */
-static int
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+                                struct nfs_setaclargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        int status;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_setacl(xdr, args, &hdr);
-        status = encode_setacl(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return status;
 }
 /*
 * Decode SETACL response
 */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                    struct nfs_setaclres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr);
+        status = decode_setattr(xdr);
 out:
        return status;
 }
@@ -5416,24 +5348,22 @@ out:
 * Decode GETACL response
 */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                    struct nfs_getaclres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_getacl(&xdr, rqstp, &res->acl_len);
+        status = decode_getacl(xdr, rqstp, &res->acl_len);
 out:
        return status;
@@ -5442,23 +5372,22 @@ out:
 /*
 * Decode CLOSE response
 */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_closeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_close(&xdr, res);
+        status = decode_close(xdr, res);
        if (status != 0)
                goto out;
        /*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
         *      an ESTALE error. Shouldn't be a problem,
         *      though, since fattr->valid will remain unset.
         */
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5476,36 +5405,35 @@ out:
 /*
 * Decode OPEN response
 */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_openres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_savefh(&xdr);
+        status = decode_savefh(xdr);
        if (status)
                goto out;
-        status = decode_open(&xdr, res);
+        status = decode_open(xdr, res);
        if (status)
                goto out;
-        if (decode_getfh(&xdr, &res->fh) != 0)
+        if (decode_getfh(xdr, &res->fh) != 0)
                goto out;
-        if (decode_getfattr(&xdr, res->f_attr, res->server,
+        if (decode_getfattr(xdr, res->f_attr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if (decode_restorefh(&xdr) != 0)
+        if (decode_restorefh(xdr) != 0)
                goto out;
-        decode_getfattr(&xdr, res->dir_attr, res->server,
+        decode_getfattr(xdr, res->dir_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5514,20 +5442,20 @@ out:
 /*
 * Decode OPEN_CONFIRM response
 */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_open_confirmres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open_confirm(&xdr, res);
+        status = decode_open_confirm(xdr, res);
 out:
        return status;
 }
@@ -5535,26 +5463,26 @@ out:
 /*
 * Decode OPEN response
 */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs_openres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open(&xdr, res);
+        status = decode_open(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->f_attr, res->server,
+        decode_getfattr(xdr, res->f_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5563,26 +5491,26 @@ out:
 /*
 * Decode SETATTR response
 */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+                                struct xdr_stream *xdr,
+                                struct nfs_setattrres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr);
+        status = decode_setattr(xdr);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5591,23 +5519,22 @@ out:
 /*
 * Decode LOCK response
 */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_lock_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_lock(&xdr, res);
+        status = decode_lock(xdr, res);
 out:
        return status;
 }
@@ -5615,23 +5542,22 @@ out:
 /*
 * Decode LOCKT response
 */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_lockt_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_lockt(&xdr, res);
+        status = decode_lockt(xdr, res);
 out:
        return status;
 }
@@ -5639,61 +5565,58 @@ out:
 /*
 * Decode LOCKU response
 */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_locku_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_locku(&xdr, res);
+        status = decode_locku(xdr, res);
 out:
        return status;
 }
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+                                          struct xdr_stream *xdr, void *dummy)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_release_lockowner(&xdr);
+                status = decode_release_lockowner(xdr);
        return status;
 }
 /*
 * Decode READLINK response
 */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+                                 struct xdr_stream *xdr,
                                 struct nfs4_readlink_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_readlink(&xdr, rqstp);
+        status = decode_readlink(xdr, rqstp);
 out:
        return status;
 }
@@ -5701,23 +5624,22 @@ out:
 /*
 * Decode READDIR response
 */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_readdir_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_readdir(&xdr, rqstp, res);
+        status = decode_readdir(xdr, rqstp, res);
 out:
        return status;
 }
@@ -5725,23 +5647,22 @@ out:
 /*
 * Decode Read response
 */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_readres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_read(&xdr, rqstp, res);
+        status = decode_read(xdr, rqstp, res);
        if (!status)
                status = res->count;
 out:
@@ -5751,26 +5672,25 @@ out:
 /*
 * Decode WRITE response
 */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_writeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_write(&xdr, res);
+        status = decode_write(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
        if (!status)
                status = res->count;
@@ -5781,26 +5701,25 @@ out:
 /*
 * Decode COMMIT response
 */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_writeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_commit(&xdr, res);
+        status = decode_commit(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5809,85 +5728,80 @@ out:
 /*
 * Decode FSINFO response
 */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
                               struct nfs4_fsinfo_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, res->fsinfo);
+                status = decode_fsinfo(xdr, res->fsinfo);
        return status;
 }
 /*
 * Decode PATHCONF response
 */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
                                 struct nfs4_pathconf_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_pathconf(&xdr, res->pathconf);
+                status = decode_pathconf(xdr, res->pathconf);
        return status;
 }
 /*
 * Decode STATFS response
 */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
                               struct nfs4_statfs_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_statfs(&xdr, res->fsstat);
+                status = decode_statfs(xdr, res->fsstat);
        return status;
 }
 /*
 * Decode GETATTR_BITMAP response
 */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_server_caps_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, req);
+        status = decode_sequence(xdr, &res->seq_res, req);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        status = decode_server_caps(&xdr, res);
+        status = decode_server_caps(xdr, res);
 out:
        return status;
 }
@@ -5895,79 +5809,77 @@ out:
 /*
 * Decode RENEW response
 */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              void *__unused)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_renew(&xdr);
+                status = decode_renew(xdr);
        return status;
 }
 /*
 * Decode SETCLIENTID response
 */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
-                struct nfs4_setclientid_res *res)
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_setclientid_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_setclientid(&xdr, res);
+                status = decode_setclientid(xdr, res);
        return status;
 }
 /*
 * Decode SETCLIENTID_CONFIRM response
 */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+                                            struct xdr_stream *xdr,
+                                            struct nfs_fsinfo *fsinfo)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_setclientid_confirm(&xdr);
+                status = decode_setclientid_confirm(xdr);
        if (!status)
-                status = decode_putrootfh(&xdr);
+                status = decode_putrootfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, fsinfo);
+                status = decode_fsinfo(xdr, fsinfo);
        return status;
 }
 /*
 * Decode DELEGRETURN response
 */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_delegreturnres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status != 0)
                goto out;
-        status = decode_delegreturn(&xdr);
+        status = decode_delegreturn(xdr);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5976,26 +5888,27 @@ out:
 /*
 * Decode FS_LOCATIONS response
 */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
                                     struct nfs4_fs_locations_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, req);
+        status = decode_sequence(xdr, &res->seq_res, req);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_lookup(&xdr)) != 0)
+        status = decode_lookup(xdr);
+        if (status)
                goto out;
-        xdr_enter_page(&xdr, PAGE_SIZE);
+        xdr_enter_page(xdr, PAGE_SIZE);
-        status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+        status = decode_getfattr(xdr, &res->fs_locations->fattr,
                                 res->fs_locations->server,
                                 !RPC_IS_ASYNC(req->rq_task));
 out:
@@ -6006,129 +5919,122 @@ out:
 /*
 * Decode EXCHANGE_ID response
 */
-static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
                                    void *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_exchange_id(&xdr, res);
+                status = decode_exchange_id(xdr, res);
        return status;
 }
 /*
 * Decode CREATE_SESSION response
 */
-static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
                                       struct nfs41_create_session_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_create_session(&xdr, res);
+                status = decode_create_session(xdr, res);
        return status;
 }
 /*
 * Decode DESTROY_SESSION response
 */
-static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
-                                        void *dummy)
+                                        struct xdr_stream *xdr,
+                                        void *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_destroy_session(&xdr, dummy);
+                status = decode_destroy_session(xdr, res);
        return status;
 }
 /*
 * Decode SEQUENCE response
 */
-static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+                                 struct xdr_stream *xdr,
                                 struct nfs4_sequence_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, res, rqstp);
+                status = decode_sequence(xdr, res, rqstp);
        return status;
 }
 /*
 * Decode GET_LEASE_TIME response
 */
-static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
                                       struct nfs4_get_lease_time_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+                status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
        if (!status)
-                status = decode_putrootfh(&xdr);
+                status = decode_putrootfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, res->lr_fsinfo);
+                status = decode_fsinfo(xdr, res->lr_fsinfo);
        return status;
 }
 /*
 * Decode RECLAIM_COMPLETE response
 */
-static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+                                         struct xdr_stream *xdr,
                                         struct nfs41_reclaim_complete_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, rqstp);
+                status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (!status)
-                status = decode_reclaim_complete(&xdr, (void *)NULL);
+                status = decode_reclaim_complete(xdr, (void *)NULL);
        return status;
 }
 /*
 * Decode GETDEVINFO response
 */
-static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+                                      struct xdr_stream *xdr,
                                      struct nfs4_getdeviceinfo_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status != 0)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status != 0)
                goto out;
-        status = decode_getdeviceinfo(&xdr, res->pdev);
+        status = decode_getdeviceinfo(xdr, res->pdev);
 out:
        return status;
 }
@@ -6136,31 +6042,44 @@ out:
 /*
 * Decode LAYOUTGET response
 */
-static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
+                                  struct xdr_stream *xdr,
                                  struct nfs4_layoutget_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_layoutget(&xdr, rqstp, res);
+        status = decode_layoutget(xdr, rqstp, res);
 out:
        return status;
 }
 #endif /* CONFIG_NFS_V4_1 */
-__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+/**
-                           struct nfs_server *server, int plus)
+ * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ */
+int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
 {
        uint32_t bitmap[2] = {0};
        uint32_t len;
@@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
                if (unlikely(!p))
                        goto out_overflow;
                if (!ntohl(*p++))
-                        return ERR_PTR(-EAGAIN);
+                        return -EAGAIN;
                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
+                return -EBADCOOKIE;
        }
        p = xdr_inline_decode(xdr, 12);
@@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
        if (decode_attr_length(xdr, &len, &p) < 0)
                goto out_overflow;
-        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
+                                        entry->server, 1) < 0)
                goto out_overflow;
        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
                entry->ino = entry->fattr->fileid;
@@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
        if (verify_attr_len(xdr, p, len) < 0)
                goto out_overflow;
-        p = xdr_inline_peek(xdr, 8);
+        return 0;
-        if (p != NULL)
-                entry->eof = !p[0] && p[1];
-        else
-                entry->eof = 0;
-        return p;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EAGAIN);
+        return -EAGAIN;
 }
 /*
@@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat)
 #define PROC(proc, argtype, restype)                            \
 [NFSPROC4_CLNT_##proc] = {                                      \
        .p_proc   = NFSPROC4_COMPOUND,                          \
-        .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,            \
+        .p_encode = (kxdreproc_t)nfs4_xdr_##argtype,            \
-        .p_decode = (kxdrproc_t) nfs4_xdr_##restype,            \
+        .p_decode = (kxdrdproc_t)nfs4_xdr_##restype,            \
        .p_arglen = NFS4_##argtype##_sz,                        \
        .p_replen = NFS4_##restype##_sz,                        \
        .p_statidx = NFSPROC4_CLNT_##proc,                      \
@@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat)
 }
 struct rpc_procinfo     nfs4_procedures[] = {
-  PROC(READ,            enc_read,       dec_read),
+        PROC(READ,              enc_read,               dec_read),
-  PROC(WRITE,           enc_write,      dec_write),
+        PROC(WRITE,             enc_write,              dec_write),
-  PROC(COMMIT,          enc_commit,     dec_commit),
+        PROC(COMMIT,            enc_commit,             dec_commit),
-  PROC(OPEN,            enc_open,       dec_open),
+        PROC(OPEN,              enc_open,               dec_open),
-  PROC(OPEN_CONFIRM,    enc_open_confirm,       dec_open_confirm),
+        PROC(OPEN_CONFIRM,      enc_open_confirm,       dec_open_confirm),
-  PROC(OPEN_NOATTR,     enc_open_noattr,        dec_open_noattr),
+        PROC(OPEN_NOATTR,       enc_open_noattr,        dec_open_noattr),
-  PROC(OPEN_DOWNGRADE,  enc_open_downgrade,     dec_open_downgrade),
+        PROC(OPEN_DOWNGRADE,    enc_open_downgrade,     dec_open_downgrade),
-  PROC(CLOSE,           enc_close,      dec_close),
+        PROC(CLOSE,             enc_close,              dec_close),
-  PROC(SETATTR,         enc_setattr,    dec_setattr),
+        PROC(SETATTR,           enc_setattr,            dec_setattr),
-  PROC(FSINFO,          enc_fsinfo,     dec_fsinfo),
+        PROC(FSINFO,            enc_fsinfo,             dec_fsinfo),
-  PROC(RENEW,           enc_renew,      dec_renew),
+        PROC(RENEW,             enc_renew,              dec_renew),
-  PROC(SETCLIENTID,     enc_setclientid,        dec_setclientid),
+        PROC(SETCLIENTID,       enc_setclientid,        dec_setclientid),
-  PROC(SETCLIENTID_CONFIRM,     enc_setclientid_confirm,        dec_setclientid_confirm),
+        PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
-  PROC(LOCK,            enc_lock,       dec_lock),
+        PROC(LOCK,              enc_lock,               dec_lock),
-  PROC(LOCKT,           enc_lockt,      dec_lockt),
+        PROC(LOCKT,             enc_lockt,              dec_lockt),
-  PROC(LOCKU,           enc_locku,      dec_locku),
+        PROC(LOCKU,             enc_locku,              dec_locku),
-  PROC(ACCESS,          enc_access,     dec_access),
+        PROC(ACCESS,            enc_access,             dec_access),
-  PROC(GETATTR,         enc_getattr,    dec_getattr),
+        PROC(GETATTR,           enc_getattr,            dec_getattr),
-  PROC(LOOKUP,          enc_lookup,     dec_lookup),
+        PROC(LOOKUP,            enc_lookup,             dec_lookup),
-  PROC(LOOKUP_ROOT,     enc_lookup_root,        dec_lookup_root),
+        PROC(LOOKUP_ROOT,       enc_lookup_root,        dec_lookup_root),
-  PROC(REMOVE,          enc_remove,     dec_remove),
+        PROC(REMOVE,            enc_remove,             dec_remove),
-  PROC(RENAME,          enc_rename,     dec_rename),
+        PROC(RENAME,            enc_rename,             dec_rename),
-  PROC(LINK,            enc_link,       dec_link),
+        PROC(LINK,              enc_link,               dec_link),
-  PROC(SYMLINK,         enc_symlink,    dec_symlink),
+        PROC(SYMLINK,           enc_symlink,            dec_symlink),
-  PROC(CREATE,          enc_create,     dec_create),
+        PROC(CREATE,            enc_create,             dec_create),
-  PROC(PATHCONF,        enc_pathconf,   dec_pathconf),
+        PROC(PATHCONF,          enc_pathconf,           dec_pathconf),
-  PROC(STATFS,          enc_statfs,     dec_statfs),
+        PROC(STATFS,            enc_statfs,             dec_statfs),
-  PROC(READLINK,        enc_readlink,   dec_readlink),
+        PROC(READLINK,          enc_readlink,           dec_readlink),
-  PROC(READDIR,         enc_readdir,    dec_readdir),
+        PROC(READDIR,           enc_readdir,            dec_readdir),
-  PROC(SERVER_CAPS,     enc_server_caps, dec_server_caps),
+        PROC(SERVER_CAPS,       enc_server_caps,        dec_server_caps),
-  PROC(DELEGRETURN,     enc_delegreturn, dec_delegreturn),
+        PROC(DELEGRETURN,       enc_delegreturn,        dec_delegreturn),
-  PROC(GETACL,          enc_getacl,     dec_getacl),
+        PROC(GETACL,            enc_getacl,             dec_getacl),
-  PROC(SETACL,          enc_setacl,     dec_setacl),
+        PROC(SETACL,            enc_setacl,             dec_setacl),
-  PROC(FS_LOCATIONS,    enc_fs_locations, dec_fs_locations),
+        PROC(FS_LOCATIONS,      enc_fs_locations,       dec_fs_locations),
-  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
+        PROC(RELEASE_LOCKOWNER, enc_release_lockowner,  dec_release_lockowner),
 #if defined(CONFIG_NFS_V4_1)
-  PROC(EXCHANGE_ID,     enc_exchange_id,        dec_exchange_id),
+        PROC(EXCHANGE_ID,       enc_exchange_id,        dec_exchange_id),
-  PROC(CREATE_SESSION,  enc_create_session,     dec_create_session),
+        PROC(CREATE_SESSION,    enc_create_session,     dec_create_session),
-  PROC(DESTROY_SESSION, enc_destroy_session,    dec_destroy_session),
+        PROC(DESTROY_SESSION,   enc_destroy_session,    dec_destroy_session),
-  PROC(SEQUENCE,        enc_sequence,   dec_sequence),
+        PROC(SEQUENCE,          enc_sequence,           dec_sequence),
-  PROC(GET_LEASE_TIME,  enc_get_lease_time,     dec_get_lease_time),
+        PROC(GET_LEASE_TIME,    enc_get_lease_time,     dec_get_lease_time),
-  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+        PROC(RECLAIM_COMPLETE,  enc_reclaim_complete,   dec_reclaim_complete),
-  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+        PROC(GETDEVICEINFO,     enc_getdeviceinfo,      dec_getdeviceinfo),
-  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
+        PROC(LAYOUTGET,         enc_layoutget,          dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b68536cc904..e1164e3f9e6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
-        struct nfs_page *p;
+        struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
-        p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
+        if (p)
-        if (p) {
-                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->wb_list);
-        }
        return p;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95..bc408976973 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 * pNFS client layout cache
 */
+/* Need to hold i_lock if caller does not already hold reference */
+void
+get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        atomic_inc(&lo->plh_refcount);
+}
 static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-        assert_spin_locked(&lo->inode->i_lock);
+        dprintk("%s: freeing layout cache %p\n", __func__, lo);
-        lo->refcount++;
+        BUG_ON(!list_empty(&lo->plh_layouts));
+        NFS_I(lo->plh_inode)->layout = NULL;
+        kfree(lo);
 }
 static void
 put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 {
-        assert_spin_locked(&lo->inode->i_lock);
+        if (atomic_dec_and_test(&lo->plh_refcount))
-        BUG_ON(lo->refcount == 0);
+                destroy_layout_hdr(lo);
-        lo->refcount--;
-        if (!lo->refcount) {
-                dprintk("%s: freeing layout cache %p\n", __func__, lo);
-                BUG_ON(!list_empty(&lo->layouts));
-                NFS_I(lo->inode)->layout = NULL;
-                kfree(lo);
-        }
 }
 void
-put_layout_hdr(struct inode *inode)
+put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-        spin_lock(&inode->i_lock);
+        struct inode *inode = lo->plh_inode;
-        put_layout_hdr_locked(NFS_I(inode)->layout);
-        spin_unlock(&inode->i_lock);
+        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+                destroy_layout_hdr(lo);
+                spin_unlock(&inode->i_lock);
+        }
 }
 static void
 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 {
-        INIT_LIST_HEAD(&lseg->fi_list);
+        INIT_LIST_HEAD(&lseg->pls_list);
-        kref_init(&lseg->kref);
+        atomic_set(&lseg->pls_refcount, 1);
-        lseg->layout = lo;
+        smp_mb();
+        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+        lseg->pls_layout = lo;
 }
-/* Called without i_lock held, as the free_lseg call may sleep */
+static void free_lseg(struct pnfs_layout_segment *lseg)
-static void
-destroy_lseg(struct kref *kref)
 {
-        struct pnfs_layout_segment *lseg =
+        struct inode *ino = lseg->pls_layout->plh_inode;
-                container_of(kref, struct pnfs_layout_segment, kref);
-        struct inode *ino = lseg->layout->inode;
-        dprintk("--> %s\n", __func__);
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-        /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+        /* Matched by get_layout_hdr in pnfs_insert_layout */
-        put_layout_hdr(ino);
+        put_layout_hdr(NFS_I(ino)->layout);
 }
-static void
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
-put_lseg(struct pnfs_layout_segment *lseg)
+ * could sleep, so must be called outside of the lock.
+ * Returns 1 if object was removed, otherwise return 0.
+ */
+static int
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+                struct list_head *tmp_list)
+{
+        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+                atomic_read(&lseg->pls_refcount),
+                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+        if (atomic_dec_and_test(&lseg->pls_refcount)) {
+                struct inode *ino = lseg->pls_layout->plh_inode;
+                BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+                list_del(&lseg->pls_list);
+                if (list_empty(&lseg->pls_layout->plh_segs)) {
+                        struct nfs_client *clp;
+                        clp = NFS_SERVER(ino)->nfs_client;
+                        spin_lock(&clp->cl_lock);
+                        /* List does not take a reference, so no need for put here */
+                        list_del_init(&lseg->pls_layout->plh_layouts);
+                        spin_unlock(&clp->cl_lock);
+                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+                }
+                rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+                list_add(&lseg->pls_list, tmp_list);
+                return 1;
+        }
+        return 0;
+}
+static bool
+should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
 {
-        if (!lseg)
+        return (recall_iomode == IOMODE_ANY ||
-                return;
+                lseg_iomode == recall_iomode);
+}
-        dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+/* Returns 1 if lseg is removed from list, 0 otherwise */
-                atomic_read(&lseg->kref.refcount));
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
-        kref_put(&lseg->kref, destroy_lseg);
+                             struct list_head *tmp_list)
+{
+        int rv = 0;
+        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+                /* Remove the reference keeping the lseg in the
+                 * list.  It will now be removed when all
+                 * outstanding io is finished.
+                 */
+                rv = put_lseg_locked(lseg, tmp_list);
+        }
+        return rv;
 }
-static void
+/* Returns the number of matching invalid lsegs remaining in list
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+ * after call.
+ */
+int
+mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+                            struct list_head *tmp_list,
+                            u32 iomode)
 {
        struct pnfs_layout_segment *lseg, *next;
-        struct nfs_client *clp;
+        int invalid = 0, removed = 0;
        dprintk("%s:Begin lo %p\n", __func__, lo);
-        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-        list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+                if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
-                dprintk("%s: freeing lseg %p\n", __func__, lseg);
+                        dprintk("%s: freeing lseg %p iomode %d "
-                list_move(&lseg->fi_list, tmp_list);
+                                "offset %llu length %llu\n", __func__,
-        }
+                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
-        clp = NFS_SERVER(lo->inode)->nfs_client;
+                                lseg->pls_range.length);
-        spin_lock(&clp->cl_lock);
+                        invalid++;
-        /* List does not take a reference, so no need for put here */
+                        removed += mark_lseg_invalid(lseg, tmp_list);
-        list_del_init(&lo->layouts);
+                }
-        spin_unlock(&clp->cl_lock);
+        dprintk("%s:Return %i\n", __func__, invalid - removed);
-        write_seqlock(&lo->seqlock);
+        return invalid - removed;
-        clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-        write_sequnlock(&lo->seqlock);
-        dprintk("%s:Return\n", __func__);
 }
-static void
+void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+pnfs_free_lseg_list(struct list_head *free_me)
 {
-        struct pnfs_layout_segment *lseg;
+        struct pnfs_layout_segment *lseg, *tmp;
-        while (!list_empty(tmp_list)) {
+        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
-                lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+                list_del(&lseg->pls_list);
-                                fi_list);
+                free_lseg(lseg);
-                dprintk("%s calling put_lseg on %p\n", __func__, lseg);
-                list_del(&lseg->fi_list);
-                put_lseg(lseg);
        }
 }
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
-                pnfs_clear_lseg_list(lo, &tmp_list);
+                set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+                mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
                /* Matched by refcount set to 1 in alloc_init_layout_hdr */
                put_layout_hdr_locked(lo);
        }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        while (!list_empty(&tmp_list)) {
                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-                                layouts);
+                                plh_layouts);
                dprintk("%s freeing layout for inode %lu\n", __func__,
-                        lo->inode->i_ino);
+                        lo->plh_inode->i_ino);
-                pnfs_destroy_layout(NFS_I(lo->inode));
+                pnfs_destroy_layout(NFS_I(lo->plh_inode));
        }
 }
-/* update lo->stateid with new if is more recent
+/* Update lo->plh_stateid with new if it is more recent */
- *
+void
- * lo->stateid could be the open stateid, in which case we just use what given.
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
- */
+                        bool update_barrier)
-static void
+{
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+        u32 oldseq, newseq;
-                        const nfs4_stateid *new)
-{
+        oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
-        nfs4_stateid *old = &lo->stateid;
+        newseq = be32_to_cpu(new->stateid.seqid);
-        bool overwrite = false;
+        if ((int)(newseq - oldseq) > 0) {
+                memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
-        write_seqlock(&lo->seqlock);
+                if (update_barrier) {
-        if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+                        u32 new_barrier = be32_to_cpu(new->stateid.seqid);
-            memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
-                overwrite = true;
+                        if ((int)(new_barrier - lo->plh_barrier))
-        else {
+                                lo->plh_barrier = new_barrier;
-                u32 oldseq, newseq;
+                } else {
+                        /* Because of wraparound, we want to keep the barrier
-                oldseq = be32_to_cpu(old->stateid.seqid);
+                         * "close" to the current seqids.  It needs to be
-                newseq = be32_to_cpu(new->stateid.seqid);
+                         * within 2**31 to count as "behind", so if it
-                if ((int)(newseq - oldseq) > 0)
+                         * gets too near that limit, give us a little leeway
-                        overwrite = true;
+                         * and bring it to within 2**30.
+                         * NOTE - and yes, this is all unsigned arithmetic.
+                         */
+                        if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+                                lo->plh_barrier = newseq - (1 << 30);
+                }
        }
-        if (overwrite)
-                memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
-        write_sequnlock(&lo->seqlock);
 }
-static void
+/* lget is set to 1 if called from inside send_layoutget call chain */
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+static bool
-                              struct nfs4_state *state)
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+                        int lget)
 {
-        int seq;
+        if ((stateid) &&
+            (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
-        dprintk("--> %s\n", __func__);
+                return true;
-        write_seqlock(&lo->seqlock);
+        return lo->plh_block_lgets ||
-        do {
+                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-                seq = read_seqbegin(&state->seqlock);
+                (list_empty(&lo->plh_segs) &&
-                memcpy(lo->stateid.data, state->stateid.data,
+                 (atomic_read(&lo->plh_outstanding) > lget));
-                       sizeof(state->stateid.data));
-        } while (read_seqretry(&state->seqlock, seq));
-        set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-        write_sequnlock(&lo->seqlock);
-        dprintk("<-- %s\n", __func__);
 }
-void
+int
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-                        struct nfs4_state *open_state)
+                              struct nfs4_state *open_state)
 {
-        int seq;
+        int status = 0;
        dprintk("--> %s\n", __func__);
-        do {
+        spin_lock(&lo->plh_inode->i_lock);
-                seq = read_seqbegin(&lo->seqlock);
+        if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
-                if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+                status = -EAGAIN;
-                        /* This will trigger retry of the read */
+        } else if (list_empty(&lo->plh_segs)) {
-                        pnfs_layout_from_open_stateid(lo, open_state);
+                int seq;
-                } else
-                        memcpy(dst->data, lo->stateid.data,
+                do {
-                               sizeof(lo->stateid.data));
+                        seq = read_seqbegin(&open_state->seqlock);
-        } while (read_seqretry(&lo->seqlock, seq));
+                        memcpy(dst->data, open_state->stateid.data,
+                               sizeof(open_state->stateid.data));
+                } while (read_seqretry(&open_state->seqlock, seq));
+        } else
+                memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
+        return status;
 }
 /*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           u32 iomode)
 {
-        struct inode *ino = lo->inode;
+        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        BUG_ON(ctx == NULL);
        lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
-        if (lgp == NULL) {
+        if (lgp == NULL)
-                put_layout_hdr(lo->inode);
                return NULL;
-        }
        lgp->args.minlength = NFS4_MAX_UINT64;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        nfs4_proc_layoutget(lgp);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
-                set_bit(lo_fail_bit(iomode), &lo->state);
+                set_bit(lo_fail_bit(iomode), &lo->plh_flags);
        }
        return lseg;
 }
+bool pnfs_roc(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        struct pnfs_layout_segment *lseg, *tmp;
+        LIST_HEAD(tmp_list);
+        bool found = false;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+                goto out_nolayout;
+        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+                        mark_lseg_invalid(lseg, &tmp_list);
+                        found = true;
+                }
+        if (!found)
+                goto out_nolayout;
+        lo->plh_block_lgets++;
+        get_layout_hdr(lo); /* matched in pnfs_roc_release */
+        spin_unlock(&ino->i_lock);
+        pnfs_free_lseg_list(&tmp_list);
+        return true;
+out_nolayout:
+        spin_unlock(&ino->i_lock);
+        return false;
+}
+void pnfs_roc_release(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        lo->plh_block_lgets--;
+        put_layout_hdr_locked(lo);
+        spin_unlock(&ino->i_lock);
+}
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+        struct pnfs_layout_hdr *lo;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        if ((int)(barrier - lo->plh_barrier) > 0)
+                lo->plh_barrier = barrier;
+        spin_unlock(&ino->i_lock);
+}
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_segment *lseg;
+        bool found = false;
+        spin_lock(&ino->i_lock);
+        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+                        found = true;
+                        break;
+                }
+        if (!found) {
+                struct pnfs_layout_hdr *lo = nfsi->layout;
+                u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+                /* Since close does not return a layout stateid for use as
+                 * a barrier, we choose the worst-case barrier.
+                 */
+                *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+        }
+        spin_unlock(&ino->i_lock);
+        return found;
+}
 /*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
        dprintk("%s:Begin\n", __func__);
-        assert_spin_locked(&lo->inode->i_lock);
+        assert_spin_locked(&lo->plh_inode->i_lock);
-        if (list_empty(&lo->segs)) {
+        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-                struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+                if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
-                spin_lock(&clp->cl_lock);
-                BUG_ON(!list_empty(&lo->layouts));
-                list_add_tail(&lo->layouts, &clp->cl_layouts);
-                spin_unlock(&clp->cl_lock);
-        }
-        list_for_each_entry(lp, &lo->segs, fi_list) {
-                if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
                        continue;
-                list_add_tail(&lseg->fi_list, &lp->fi_list);
+                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
-                        __func__, lseg, lseg->range.iomode,
+                        __func__, lseg, lseg->pls_range.iomode,
-                        lseg->range.offset, lseg->range.length,
+                        lseg->pls_range.offset, lseg->pls_range.length,
-                        lp, lp->range.iomode, lp->range.offset,
+                        lp, lp->pls_range.iomode, lp->pls_range.offset,
-                        lp->range.length);
+                        lp->pls_range.length);
                found = 1;
                break;
        }
        if (!found) {
-                list_add_tail(&lseg->fi_list, &lo->segs);
+                list_add_tail(&lseg->pls_list, &lo->plh_segs);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu at tail\n",
-                        __func__, lseg, lseg->range.iomode,
+                        __func__, lseg, lseg->pls_range.iomode,
-                        lseg->range.offset, lseg->range.length);
+                        lseg->pls_range.offset, lseg->pls_range.length);
        }
-        get_layout_hdr_locked(lo);
+        get_layout_hdr(lo);
        dprintk("%s:Return\n", __func__);
 }
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
        lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
        if (!lo)
                return NULL;
-        lo->refcount = 1;
+        atomic_set(&lo->plh_refcount, 1);
-        INIT_LIST_HEAD(&lo->layouts);
+        INIT_LIST_HEAD(&lo->plh_layouts);
-        INIT_LIST_HEAD(&lo->segs);
+        INIT_LIST_HEAD(&lo->plh_segs);
-        seqlock_init(&lo->seqlock);
+        INIT_LIST_HEAD(&lo->plh_bulk_recall);
-        lo->inode = ino;
+        lo->plh_inode = ino;
        return lo;
 }
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
        assert_spin_locked(&ino->i_lock);
-        if (nfsi->layout)
+        if (nfsi->layout) {
-                return nfsi->layout;
+                if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+                        return NULL;
+                else
+                        return nfsi->layout;
+        }
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino);
        spin_lock(&ino->i_lock);
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
 static int
 is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
 {
-        return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+        return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
 }
 /*
 * lookup range in layout
 */
 static struct pnfs_layout_segment *
-pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
        dprintk("%s:Begin\n", __func__);
-        assert_spin_locked(&lo->inode->i_lock);
+        assert_spin_locked(&lo->plh_inode->i_lock);
-        list_for_each_entry(lseg, &lo->segs, fi_list) {
+        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
-                if (is_matching_lseg(lseg, iomode)) {
+                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+                    is_matching_lseg(lseg, iomode)) {
                        ret = lseg;
                        break;
                }
-                if (cmp_layout(iomode, lseg->range.iomode) > 0)
+                if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
                        break;
        }
        dprintk("%s:Return lseg %p ref %d\n",
-                __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
 }
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
                   enum pnfs_iomode iomode)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
+        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg = NULL;
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
                goto out_unlock;
        }
-        /* Check to see if the layout for the given range already exists */
+        /* Do we even need to bother with this? */
-        lseg = pnfs_has_layout(lo, iomode);
+        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-        if (lseg) {
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-                dprintk("%s: Using cached lseg %p for iomode %d)\n",
+                dprintk("%s matches recall, use MDS\n", __func__);
-                        __func__, lseg, iomode);
                goto out_unlock;
        }
+        /* Check to see if the layout for the given range already exists */
+        lseg = pnfs_find_lseg(lo, iomode);
+        if (lseg)
+                goto out_unlock;
        /* if LAYOUTGET already failed once we don't try again */
-        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+                goto out_unlock;
+        if (pnfs_layoutgets_blocked(lo, NULL, 0))
                goto out_unlock;
+        atomic_inc(&lo->plh_outstanding);
-        get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+        get_layout_hdr(lo);
+        if (list_empty(&lo->plh_segs)) {
+                /* The lo must be on the clp list if there is any
+                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+                 */
+                spin_lock(&clp->cl_lock);
+                BUG_ON(!list_empty(&lo->plh_layouts));
+                list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
+                spin_unlock(&clp->cl_lock);
+        }
        spin_unlock(&ino->i_lock);
        lseg = send_layoutget(lo, ctx, iomode);
+        if (!lseg) {
+                spin_lock(&ino->i_lock);
+                if (list_empty(&lo->plh_segs)) {
+                        spin_lock(&clp->cl_lock);
+                        list_del_init(&lo->plh_layouts);
+                        spin_unlock(&clp->cl_lock);
+                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+                }
+                spin_unlock(&ino->i_lock);
+        }
+        atomic_dec(&lo->plh_outstanding);
+        put_layout_hdr(lo);
 out:
        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-                nfsi->layout->state, lseg);
+                nfsi->layout->plh_flags, lseg);
        return lseg;
 out_unlock:
        spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
        struct nfs4_layoutget_res *res = &lgp->res;
        struct pnfs_layout_segment *lseg;
-        struct inode *ino = lo->inode;
+        struct inode *ino = lo->plh_inode;
+        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        int status = 0;
+        /* Verify we got what we asked for.
+         * Note that because the xdr parsing only accepts a single
+         * element array, this can fail even if the server is behaving
+         * correctly.
+         */
+        if (lgp->args.range.iomode > res->range.iomode ||
+            res->range.offset != 0 ||
+            res->range.length != NFS4_MAX_UINT64) {
+                status = -EINVAL;
+                goto out;
+        }
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
        if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        }
        spin_lock(&ino->i_lock);
+        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+                dprintk("%s forget reply due to recall\n", __func__);
+                goto out_forget_reply;
+        }
+        if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+                dprintk("%s forget reply due to state\n", __func__);
+                goto out_forget_reply;
+        }
        init_lseg(lo, lseg);
-        lseg->range = res->range;
+        lseg->pls_range = res->range;
        *lgp->lsegpp = lseg;
        pnfs_insert_layout(lo, lseg);
+        if (res->return_on_close) {
+                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+        }
        /* Done processing layoutget. Set the layout stateid */
-        pnfs_set_layout_stateid(lo, &res->stateid);
+        pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
 out:
        return status;
+out_forget_reply:
+        spin_unlock(&ino->i_lock);
+        lseg->pls_layout = lo;
+        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+        goto out;
 }
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d5048..e2612ea0cbe 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
+enum {
+        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
+        NFS_LSEG_ROC,           /* roc bit received from server */
+};
 struct pnfs_layout_segment {
-        struct list_head fi_list;
+        struct list_head pls_list;
-        struct pnfs_layout_range range;
+        struct pnfs_layout_range pls_range;
-        struct kref kref;
+        atomic_t pls_refcount;
-        struct pnfs_layout_hdr *layout;
+        unsigned long pls_flags;
+        struct pnfs_layout_hdr *pls_layout;
 };
 #ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
 enum {
        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
-        NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
+        NFS_LAYOUT_BULK_RECALL,         /* bulk recall affecting layout */
+        NFS_LAYOUT_ROC,                 /* some lseg had roc bit set */
+        NFS_LAYOUT_DESTROYED,           /* no new use of layout allowed */
 };
 /* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
 };
 struct pnfs_layout_hdr {
-        unsigned long           refcount;
+        atomic_t                plh_refcount;
-        struct list_head        layouts;   /* other client layouts */
+        struct list_head        plh_layouts;   /* other client layouts */
-        struct list_head        segs;      /* layout segments list */
+        struct list_head        plh_bulk_recall; /* clnt list of bulk recalls */
-        seqlock_t               seqlock;   /* Protects the stateid */
+        struct list_head        plh_segs;      /* layout segments list */
-        nfs4_stateid            stateid;
+        nfs4_stateid            plh_stateid;
-        unsigned long           state;
+        atomic_t                plh_outstanding; /* number of RPCs out */
-        struct inode            *inode;
+        unsigned long           plh_block_lgets; /* block LAYOUTGET if >0 */
+        u32                     plh_barrier; /* ignore lower seqids */
+        unsigned long           plh_flags;
+        struct inode            *plh_inode;
 };
 struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
 /* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
                   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct inode *inode);
+void put_layout_hdr(struct pnfs_layout_hdr *lo);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-                             struct nfs4_state *open_state);
+                             const nfs4_stateid *new,
+                             bool update_barrier);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+                                  struct pnfs_layout_hdr *lo,
+                                  struct nfs4_state *open_state);
+int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+                                struct list_head *tmp_list,
+                                u32 iomode);
+bool pnfs_roc(struct inode *ino);
+void pnfs_roc_release(struct inode *ino);
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
        return NULL;
 }
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+        return false;
+}
+static inline void
+pnfs_roc_release(struct inode *ino)
+{
+}
+static inline void
+pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+}
+static inline bool
+pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+        return false;
+}
 static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
 {
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1f..77d5e21c4ad 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
        fattr = nfs_alloc_fattr();
        status = -ENOMEM;
        if (fh == NULL || fattr == NULL)
-                goto out;
+                goto out_free;
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
        if (status == 0)
                status = nfs_instantiate(dentry, fh, fattr);
+out_free:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fh);
 out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .statfs         = nfs_proc_statfs,
        .fsinfo         = nfs_proc_fsinfo,
        .pathconf       = nfs_proc_pathconf,
-        .decode_dirent  = nfs_decode_dirent,
+        .decode_dirent  = nfs2_decode_dirent,
        .read_setup     = nfs_proc_read_setup,
        .read_done      = nfs_read_done,
        .write_setup    = nfs_proc_write_setup,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4100630c9a5..0f9ea73e778 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
        if (nfss->mountd_version || showdefaults)
                seq_printf(m, ",mountvers=%u", nfss->mountd_version);
-        if (nfss->mountd_port || showdefaults)
+        if ((nfss->mountd_port &&
+                nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+                showdefaults)
                seq_printf(m, ",mountport=%u", nfss->mountd_port);
        nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -2494,7 +2496,13 @@ static void nfs4_clone_super(struct super_block *sb,
        sb->s_maxbytes = old_sb->s_maxbytes;
        sb->s_time_gran = 1;
        sb->s_op = old_sb->s_op;
-        nfs_initialise_sb(sb);
+        /*
+         * The VFS shouldn't apply the umask to mode bits. We will do
+         * so ourselves when necessary.
+         */
+        sb->s_flags  |= MS_POSIXACL;
+        sb->s_xattr  = old_sb->s_xattr;
+        nfs_initialise_sb(sb);
 }
 /*
@@ -2504,6 +2512,12 @@ static void nfs4_fill_super(struct super_block *sb)
 {
        sb->s_time_gran = 1;
        sb->s_op = &nfs4_sops;
+        /*
+         * The VFS shouldn't apply the umask to mode bits. We will do
+         * so ourselves when necessary.
+         */
+        sb->s_flags  |= MS_POSIXACL;
+        sb->s_xattr = nfs4_xattr_handlers;
        nfs_initialise_sb(sb);
 }
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7bdec853140..e313a51acdd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
                return ERR_PTR(-ENOMEM);
-        task_setup_data.callback_data = data,
+        task_setup_data.callback_data = data;
        data->cred = rpc_lookup_cred();
        if (IS_ERR(data->cred)) {
@@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
-                atomic_read(&dentry->d_count));
+                dentry->d_count);
        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
        /*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7..21a63da305f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
        NFSPROC4_CLNT_CB_SEQUENCE,
 };
-enum nfs_cb_opnum4 {
-        OP_CB_RECALL            = 4,
-        OP_CB_SEQUENCE          = 11,
-};
 #define NFS4_MAXTAGLEN          20
 #define NFS4_enc_cb_null_sz             0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
                                        cb_sequence_dec_sz +            \
                                        op_dec_sz)
-/*
-* Generic encode routines from fs/nfs/nfs4xdr.c
-*/
-static inline __be32 *
-xdr_writemem(__be32 *p, const void *ptr, int nbytes)
-{
-        int tmp = XDR_QUADLEN(nbytes);
-        if (!tmp)
-                return p;
-        p[tmp-1] = 0;
-        memcpy(p, ptr, nbytes);
-        return p + tmp;
-}
-#define WRITE32(n)               *p++ = htonl(n)
-#define WRITEMEM(ptr,nbytes)     do {                           \
-        p = xdr_writemem(p, ptr, nbytes);                       \
-} while (0)
-#define RESERVE_SPACE(nbytes)   do {                            \
-        p = xdr_reserve_space(xdr, nbytes);                     \
-        if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
-        BUG_ON(!p);                                             \
-} while (0)
-/*
- * Generic decode routines from fs/nfs/nfs4xdr.c
- */
-#define DECODE_TAIL                             \
-        status = 0;                             \
-out:                                            \
-        return status;                          \
-xdr_error:                                      \
-        dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
-        status = -EIO;                          \
-        goto out
-#define READ32(x)         (x) = ntohl(*p++)
-#define READ64(x)         do {                  \
-        (x) = (u64)ntohl(*p++) << 32;           \
-        (x) |= ntohl(*p++);                     \
-} while (0)
-#define READTIME(x)       do {                  \
-        p++;                                    \
-        (x.tv_sec) = ntohl(*p++);               \
-        (x.tv_nsec) = ntohl(*p++);              \
-} while (0)
-#define READ_BUF(nbytes)  do { \
-        p = xdr_inline_decode(xdr, nbytes); \
-        if (!p) { \
-                dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
-                        __func__, __LINE__); \
-                return -EIO; \
-        } \
-} while (0)
 struct nfs4_cb_compound_hdr {
        /* args */
        u32             ident;  /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
        int             status;
 };
-static struct {
+/*
-int stat;
+ * Handle decode buffer overflows out-of-line.
-int errno;
+ */
-} nfs_cb_errtbl[] = {
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-        { NFS4_OK,              0               },
+{
-        { NFS4ERR_PERM,         EPERM           },
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-        { NFS4ERR_NOENT,        ENOENT          },
+                "Remaining buffer length is %tu words.\n",
-        { NFS4ERR_IO,           EIO             },
+                func, xdr->end - xdr->p);
-        { NFS4ERR_NXIO,         ENXIO           },
+}
-        { NFS4ERR_ACCESS,       EACCES          },
-        { NFS4ERR_EXIST,        EEXIST          },
-        { NFS4ERR_XDEV,         EXDEV           },
-        { NFS4ERR_NOTDIR,       ENOTDIR         },
-        { NFS4ERR_ISDIR,        EISDIR          },
-        { NFS4ERR_INVAL,        EINVAL          },
-        { NFS4ERR_FBIG,         EFBIG           },
-        { NFS4ERR_NOSPC,        ENOSPC          },
-        { NFS4ERR_ROFS,         EROFS           },
-        { NFS4ERR_MLINK,        EMLINK          },
-        { NFS4ERR_NAMETOOLONG,  ENAMETOOLONG    },
-        { NFS4ERR_NOTEMPTY,     ENOTEMPTY       },
-        { NFS4ERR_DQUOT,        EDQUOT          },
-        { NFS4ERR_STALE,        ESTALE          },
-        { NFS4ERR_BADHANDLE,    EBADHANDLE      },
-        { NFS4ERR_BAD_COOKIE,   EBADCOOKIE      },
-        { NFS4ERR_NOTSUPP,      ENOTSUPP        },
-        { NFS4ERR_TOOSMALL,     ETOOSMALL       },
-        { NFS4ERR_SERVERFAULT,  ESERVERFAULT    },
-        { NFS4ERR_BADTYPE,      EBADTYPE        },
-        { NFS4ERR_LOCKED,       EAGAIN          },
-        { NFS4ERR_RESOURCE,     EREMOTEIO       },
-        { NFS4ERR_SYMLINK,      ELOOP           },
-        { NFS4ERR_OP_ILLEGAL,   EOPNOTSUPP      },
-        { NFS4ERR_DEADLOCK,     EDEADLK         },
-        { -1,                   EIO             }
-};
-static int
+static __be32 *xdr_encode_empty_array(__be32 *p)
-nfs_cb_stat_to_errno(int stat)
 {
-        int i;
+        *p++ = xdr_zero;
-        for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
+        return p;
-                if (nfs_cb_errtbl[i].stat == stat)
-                        return nfs_cb_errtbl[i].errno;
-        }
-        /* If we cannot translate the error, the recovery routines should
-        * handle it.
-        * Note: remaining NFSv4 error codes have values > 10000, so should
-        * not conflict with native Linux error codes.
-        */
-        return stat;
 }
 /*
- * XDR encode
+ * Encode/decode NFSv4 CB basic data types
+ *
+ * Basic NFSv4 callback data types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section
+ * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
+ * 1 Protocol"
+ */
+/*
+ *      nfs_cb_opnum4
+ *
+ *      enum nfs_cb_opnum4 {
+ *              OP_CB_GETATTR           = 3,
+ *                ...
+ *      };
 */
+enum nfs_cb_opnum4 {
+        OP_CB_GETATTR                   = 3,
+        OP_CB_RECALL                    = 4,
+        OP_CB_LAYOUTRECALL              = 5,
+        OP_CB_NOTIFY                    = 6,
+        OP_CB_PUSH_DELEG                = 7,
+        OP_CB_RECALL_ANY                = 8,
+        OP_CB_RECALLABLE_OBJ_AVAIL      = 9,
+        OP_CB_RECALL_SLOT               = 10,
+        OP_CB_SEQUENCE                  = 11,
+        OP_CB_WANTS_CANCELLED           = 12,
+        OP_CB_NOTIFY_LOCK               = 13,
+        OP_CB_NOTIFY_DEVICEID           = 14,
+        OP_CB_ILLEGAL                   = 10044
+};
-static void
+static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
-encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
 {
        __be32 *p;
-        RESERVE_SPACE(sizeof(stateid_t));
+        p = xdr_reserve_space(xdr, 4);
-        WRITE32(sid->si_generation);
+        *p = cpu_to_be32(op);
-        WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
 }
-static void
+/*
-encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
+ * nfs_fh4
+ *
+ *      typedef opaque nfs_fh4<NFS4_FHSIZE>;
+ */
+static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
 {
-        __be32 * p;
+        u32 length = fh->fh_size;
+        __be32 *p;
-        RESERVE_SPACE(16);
+        BUG_ON(length > NFS4_FHSIZE);
-        WRITE32(0);            /* tag length is always 0 */
+        p = xdr_reserve_space(xdr, 4 + length);
-        WRITE32(hdr->minorversion);
+        xdr_encode_opaque(p, &fh->fh_base, length);
-        WRITE32(hdr->ident);
-        hdr->nops_p = p;
-        WRITE32(hdr->nops);
 }
-static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+/*
+ * stateid4
+ *
+ *      struct stateid4 {
+ *              uint32_t        seqid;
+ *              opaque          other[12];
+ *      };
+ */
+static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
 {
-        *hdr->nops_p = htonl(hdr->nops);
+        __be32 *p;
+        p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
+        *p++ = cpu_to_be32(sid->si_generation);
+        xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
 }
-static void
+/*
-encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+ * sessionid4
-                struct nfs4_cb_compound_hdr *hdr)
+ *
+ *      typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
+ */
+static void encode_sessionid4(struct xdr_stream *xdr,
+                              const struct nfsd4_session *session)
 {
        __be32 *p;
-        int len = dp->dl_fh.fh_size;
+        p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
-        RESERVE_SPACE(4);
+        xdr_encode_opaque_fixed(p, session->se_sessionid.data,
-        WRITE32(OP_CB_RECALL);
+                                        NFS4_MAX_SESSIONID_LEN);
-        encode_stateid(xdr, &dp->dl_stateid);
-        RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
-        WRITE32(0); /* truncate optimization not implemented */
-        WRITE32(len);
-        WRITEMEM(&dp->dl_fh.fh_base, len);
-        hdr->nops++;
 }
-static void
+/*
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
+ * nfsstat4
-                   struct nfs4_cb_compound_hdr *hdr)
+ */
-{
+static const struct {
-        __be32 *p;
+        int stat;
-        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
+        int errno;
+} nfs_cb_errtbl[] = {
+        { NFS4_OK,              0               },
+        { NFS4ERR_PERM,         -EPERM          },
+        { NFS4ERR_NOENT,        -ENOENT         },
+        { NFS4ERR_IO,           -EIO            },
+        { NFS4ERR_NXIO,         -ENXIO          },
+        { NFS4ERR_ACCESS,       -EACCES         },
+        { NFS4ERR_EXIST,        -EEXIST         },
+        { NFS4ERR_XDEV,         -EXDEV          },
+        { NFS4ERR_NOTDIR,       -ENOTDIR        },
+        { NFS4ERR_ISDIR,        -EISDIR         },
+        { NFS4ERR_INVAL,        -EINVAL         },
+        { NFS4ERR_FBIG,         -EFBIG          },
+        { NFS4ERR_NOSPC,        -ENOSPC         },
+        { NFS4ERR_ROFS,         -EROFS          },
+        { NFS4ERR_MLINK,        -EMLINK         },
+        { NFS4ERR_NAMETOOLONG,  -ENAMETOOLONG   },
+        { NFS4ERR_NOTEMPTY,     -ENOTEMPTY      },
+        { NFS4ERR_DQUOT,        -EDQUOT         },
+        { NFS4ERR_STALE,        -ESTALE         },
+        { NFS4ERR_BADHANDLE,    -EBADHANDLE     },
+        { NFS4ERR_BAD_COOKIE,   -EBADCOOKIE     },
+        { NFS4ERR_NOTSUPP,      -ENOTSUPP       },
+        { NFS4ERR_TOOSMALL,     -ETOOSMALL      },
+        { NFS4ERR_SERVERFAULT,  -ESERVERFAULT   },
+        { NFS4ERR_BADTYPE,      -EBADTYPE       },
+        { NFS4ERR_LOCKED,       -EAGAIN         },
+        { NFS4ERR_RESOURCE,     -EREMOTEIO      },
+        { NFS4ERR_SYMLINK,      -ELOOP          },
+        { NFS4ERR_OP_ILLEGAL,   -EOPNOTSUPP     },
+        { NFS4ERR_DEADLOCK,     -EDEADLK        },
+        { -1,                   -EIO            }
+};
-        if (hdr->minorversion == 0)
+/*
-                return;
+ * If we cannot translate the error, the recovery routines should
+ * handle it.
+ *
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+static int nfs_cb_stat_to_errno(int status)
+{
+        int i;
-        RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
+        for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
+                if (nfs_cb_errtbl[i].stat == status)
+                        return nfs_cb_errtbl[i].errno;
+        }
-        WRITE32(OP_CB_SEQUENCE);
+        dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
-        WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        return -status;
-        WRITE32(ses->se_cb_seq_nr);
-        WRITE32(0);             /* slotid, always 0 */
-        WRITE32(0);             /* highest slotid always 0 */
-        WRITE32(0);             /* cachethis always 0 */
-        WRITE32(0); /* FIXME: support referring_call_lists */
-        hdr->nops++;
 }
-static int
+static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
-nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
+                               enum nfsstat4 *status)
 {
-        struct xdr_stream xdrs, *xdr = &xdrs;
+        __be32 *p;
+        u32 op;
-        xdr_init_encode(&xdrs, &req->rq_snd_buf, p);
+        p = xdr_inline_decode(xdr, 4 + 4);
-        RESERVE_SPACE(0);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        op = be32_to_cpup(p++);
+        if (unlikely(op != expected))
+                goto out_unexpected;
+        *status = be32_to_cpup(p);
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+out_unexpected:
+        dprintk("NFSD: Callback server returned operation %d but "
+                "we issued a request for %d\n", op, expected);
+        return -EIO;
 }
-static int
+/*
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+ * CB_COMPOUND4args
-                struct nfsd4_callback *cb)
+ *
+ *      struct CB_COMPOUND4args {
+ *              utf8str_cs      tag;
+ *              uint32_t        minorversion;
+ *              uint32_t        callback_ident;
+ *              nfs_cb_argop4   argarray<>;
+ *      };
+*/
+static void encode_cb_compound4args(struct xdr_stream *xdr,
+                                    struct nfs4_cb_compound_hdr *hdr)
 {
-        struct xdr_stream xdr;
+        __be32 * p;
-        struct nfs4_delegation *args = cb->cb_op;
-        struct nfs4_cb_compound_hdr hdr = {
-                .ident = cb->cb_clp->cl_cb_ident,
-                .minorversion = cb->cb_minorversion,
-        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
-        encode_cb_compound_hdr(&xdr, &hdr);
+        p = xdr_encode_empty_array(p);          /* empty tag */
-        encode_cb_sequence(&xdr, cb, &hdr);
+        *p++ = cpu_to_be32(hdr->minorversion);
-        encode_cb_recall(&xdr, args, &hdr);
+        *p++ = cpu_to_be32(hdr->ident);
-        encode_cb_nops(&hdr);
+        hdr->nops_p = p;
+        *p = cpu_to_be32(hdr->nops);            /* argarray element count */
+}
+/*
+ * Update argarray element count
+ */
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+{
+        BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
+        *hdr->nops_p = cpu_to_be32(hdr->nops);
+}
+/*
+ * CB_COMPOUND4res
+ *
+ *      struct CB_COMPOUND4res {
+ *              nfsstat4        status;
+ *              utf8str_cs      tag;
+ *              nfs_cb_resop4   resarray<>;
+ *      };
+ *
+ * Decodes the compound status and the resarray element count into
+ * @hdr; the contents of the tag are skipped.
+ *
+ * Returns 0 on success, or -EIO if the reply is shorter than the
+ * encoded lengths claim.
+ */
+static int decode_cb_compound4res(struct xdr_stream *xdr,
+                                  struct nfs4_cb_compound_hdr *hdr)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        hdr->status = be32_to_cpup(p++);
+        /* Ignore the tag */
+        length = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, length + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        /*
+         * p points at the start of the tag; step over its (padded)
+         * bytes so that the element count is read from the word that
+         * follows the tag rather than from the tag itself.
+         */
+        p += XDR_QUADLEN(length);
+        hdr->nops = be32_to_cpup(p);
         return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
+/*
+ * CB_RECALL4args
+ *
+ *      struct CB_RECALL4args {
+ *              stateid4        stateid;
+ *              bool            truncate;
+ *              nfs_fh4         fh;
+ *      };
+ */
+static void encode_cb_recall4args(struct xdr_stream *xdr,
+                                  const struct nfs4_delegation *dp,
+                                  struct nfs4_cb_compound_hdr *hdr)
+{
+        __be32 *p;
+        encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
+        encode_stateid4(xdr, &dp->dl_stateid);
+        p = xdr_reserve_space(xdr, 4);
+        *p++ = xdr_zero;                        /* truncate */
-static int
+        encode_nfs_fh4(xdr, &dp->dl_fh);
-decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
-        __be32 *p;
-        u32 taglen;
-        READ_BUF(8);
+        hdr->nops++;
-        READ32(hdr->status);
-        /* We've got no use for the tag; ignore it: */
-        READ32(taglen);
-        READ_BUF(taglen + 4);
-        p += XDR_QUADLEN(taglen);
-        READ32(hdr->nops);
-        return 0;
 }
-static int
+/*
-decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+ * CB_SEQUENCE4args
+ *
+ *      struct CB_SEQUENCE4args {
+ *              sessionid4              csa_sessionid;
+ *              sequenceid4             csa_sequenceid;
+ *              slotid4                 csa_slotid;
+ *              slotid4                 csa_highest_slotid;
+ *              bool                    csa_cachethis;
+ *              referring_call_list4    csa_referring_call_lists<>;
+ *      };
+ */
+static void encode_cb_sequence4args(struct xdr_stream *xdr,
+                                    const struct nfsd4_callback *cb,
+                                    struct nfs4_cb_compound_hdr *hdr)
 {
+        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
        __be32 *p;
-        u32 op;
-        int32_t nfserr;
+        if (hdr->minorversion == 0)
+                return;
-        READ_BUF(8);
-        READ32(op);
+        encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
-        if (op != expected) {
+        encode_sessionid4(xdr, session);
-                dprintk("NFSD: decode_cb_op_hdr: Callback server returned "
-                         " operation %d but we issued a request for %d\n",
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
-                         op, expected);
+        *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
-                return -EIO;
+        *p++ = xdr_zero;                        /* csa_slotid */
-        }
+        *p++ = xdr_zero;                        /* csa_highest_slotid */
-        READ32(nfserr);
+        *p++ = xdr_zero;                        /* csa_cachethis */
-        if (nfserr != NFS_OK)
+        xdr_encode_empty_array(p);              /* csa_referring_call_lists */
-                return -nfs_cb_stat_to_errno(nfserr);
-        return 0;
+        hdr->nops++;
 }
 /*
+ * CB_SEQUENCE4resok
+ *
+ *      struct CB_SEQUENCE4resok {
+ *              sessionid4      csr_sessionid;
+ *              sequenceid4     csr_sequenceid;
+ *              slotid4         csr_slotid;
+ *              slotid4         csr_highest_slotid;
+ *              slotid4         csr_target_highest_slotid;
+ *      };
+ *
+ *      union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
+ *      case NFS4_OK:
+ *              CB_SEQUENCE4resok       csr_resok4;
+ *      default:
+ *              void;
+ *      };
+ *
 * Our current back channel implmentation supports a single backchannel
 * with a single slot.
+ *
+ * Returns 0 if the session id, sequence number and slot id in the
+ * reply match what was sent, -ESERVERFAULT if any of them differ, or
+ * -EIO if the reply is too short.
 */
-static int
+static int decode_cb_sequence4resok(struct xdr_stream *xdr,
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
+                                    struct nfsd4_callback *cb)
-                   struct rpc_rqst *rqstp)
 {
-        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
+        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
        struct nfs4_sessionid id;
        int status;
-        u32 dummy;
        __be32 *p;
+        u32 dummy;
-        if (cb->cb_minorversion == 0)
+        status = -ESERVERFAULT;
-                return 0;
-        status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
-        if (status)
-                return status;
        /*
         * If the server returns different values for sessionID, slotID or
         * sequence number, the server is looney tunes.
         */
-        status = -ESERVERFAULT;
+        /*
+         * Consume the whole resok body: the session id followed by
+         * four 32-bit words (sequenceid, slotid, highest_slotid and
+         * target_highest_slotid).  The last two are not examined yet,
+         * but they must still be pulled off the stream so that the
+         * next operation is decoded from the right offset.
+         */
+        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
+        if (unlikely(p == NULL))
-        READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+                goto out_overflow;
        memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+        if (memcmp(id.data, session->se_sessionid.data,
-        if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
+                                        NFS4_MAX_SESSIONID_LEN) != 0) {
-                dprintk("%s Invalid session id\n", __func__);
+                dprintk("NFS: %s Invalid session id\n", __func__);
                goto out;
        }
-        READ32(dummy);
+        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-        if (dummy != ses->se_cb_seq_nr) {
-                dprintk("%s Invalid sequence number\n", __func__);
+        dummy = be32_to_cpup(p++);
+        if (dummy != session->se_cb_seq_nr) {
+                dprintk("NFS: %s Invalid sequence number\n", __func__);
                goto out;
        }
-        READ32(dummy);  /* slotid must be 0 */
+        dummy = be32_to_cpup(p++);
        if (dummy != 0) {
-                dprintk("%s Invalid slotid\n", __func__);
+                dprintk("NFS: %s Invalid slotid\n", __func__);
                goto out;
        }
-        /* FIXME: process highest slotid and target highest slotid */
+        /*
+         * FIXME: process highest slotid and target highest slotid
+         */
        status = 0;
 out:
        return status;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
+/*
+ * Decode the CB_SEQUENCE result of a callback compound.
+ *
+ * Nothing is decoded for minor version 0, which carries no
+ * CB_SEQUENCE operation.  Otherwise the operation header is decoded
+ * and, if the client reported NFS4_OK, the resok body is checked.
+ *
+ * Returns 0 on success, a negative errno from the header or body
+ * decoders, or the translation of the client's nfsstat4 when the
+ * operation itself failed.
+ */
+static int decode_cb_sequence4res(struct xdr_stream *xdr,
+                                  struct nfsd4_callback *cb)
+{
+        enum nfsstat4 nfserr;
+        int status;
+        if (cb->cb_minorversion == 0)
+                return 0;
+        status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
+        if (unlikely(status))
+                goto out;
+        if (unlikely(nfserr != NFS4_OK))
+                goto out_default;
+        status = decode_cb_sequence4resok(xdr, cb);
+out:
+        return status;
+out_default:
+        /*
+         * Translate the status the client returned.  "status" is
+         * always 0 on this path, so translating it would report a
+         * failed CB_SEQUENCE as success.
+         */
+        return nfs_cb_stat_to_errno(nfserr);
+}
-static int
+/*
-nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
+ * NFSv4.0 and NFSv4.1 XDR encode functions
+ *
+ * NFSv4.0 callback argument types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+/*
+ * NB: Without this zero space reservation, callbacks over krb5p fail
+ */
+static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 void *__unused)
+{
+        xdr_reserve_space(xdr, 0);
+}
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ */
+static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                   const struct nfsd4_callback *cb)
+{
+        const struct nfs4_delegation *args = cb->cb_op;
+        struct nfs4_cb_compound_hdr hdr = {
+                .ident = cb->cb_clp->cl_cb_ident,
+                .minorversion = cb->cb_minorversion,
+        };
+        encode_cb_compound4args(xdr, &hdr);
+        encode_cb_sequence4args(xdr, cb, &hdr);
+        encode_cb_recall4args(xdr, args, &hdr);
+        encode_cb_nops(&hdr);
+}
+/*
+ * NFSv4.0 and NFSv4.1 XDR decode functions
+ *
+ * NFSv4.0 callback result types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                void *__unused)
 {
        return 0;
 }
-static int
+/*
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
-                struct nfsd4_callback *cb)
+ *
+ * Decodes the compound header, the CB_SEQUENCE result (only when @cb
+ * is non-NULL) and the CB_RECALL operation status.
+ *
+ * Returns 0 if every step succeeded and the client answered NFS4_OK,
+ * a negative errno from one of the decoders, or the translation of
+ * the nfsstat4 the client returned for CB_RECALL.
+ */
+static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
+                                  struct xdr_stream *xdr,
+                                  struct nfsd4_callback *cb)
 {
-        struct xdr_stream xdr;
        struct nfs4_cb_compound_hdr hdr;
+        enum nfsstat4 nfserr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_cb_compound4res(xdr, &hdr);
-        status = decode_cb_compound_hdr(&xdr, &hdr);
+        if (unlikely(status))
-        if (status)
                goto out;
-        if (cb) {
-                status = decode_cb_sequence(&xdr, cb, rqstp);
+        if (cb != NULL) {
-                if (status)
+                status = decode_cb_sequence4res(xdr, cb);
+                if (unlikely(status))
                        goto out;
        }
-        status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
+        status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
+        if (unlikely(status))
+                goto out;
+        if (unlikely(nfserr != NFS4_OK))
+                goto out_default;
 out:
        return status;
+out_default:
+        /*
+         * Translate the status the client returned.  "status" is
+         * always 0 on this path, so translating it would report a
+         * rejected recall as success.
+         */
+        return nfs_cb_stat_to_errno(nfserr);
 }
 /*
 * RPC procedure tables
 */
-#define PROC(proc, call, argtype, restype)                              \
+#define PROC(proc, call, argtype, restype)                              \
-[NFSPROC4_CLNT_##proc] = {                                              \
+[NFSPROC4_CLNT_##proc] = {                                              \
-        .p_proc   = NFSPROC4_CB_##call,                                 \
+        .p_proc    = NFSPROC4_CB_##call,                                \
-        .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                    \
+        .p_encode  = (kxdreproc_t)nfs4_xdr_enc_##argtype,               \
-        .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                    \
+        .p_decode  = (kxdrdproc_t)nfs4_xdr_dec_##restype,               \
-        .p_arglen = NFS4_##argtype##_sz,                                \
+        .p_arglen  = NFS4_enc_##argtype##_sz,                           \
-        .p_replen = NFS4_##restype##_sz,                                \
+        .p_replen  = NFS4_dec_##restype##_sz,                           \
-        .p_statidx = NFSPROC4_CB_##call,                                \
+        .p_statidx = NFSPROC4_CB_##call,                                \
-        .p_name   = #proc,                                              \
+        .p_name    = #proc,                                             \
-}
+}
-static struct rpc_procinfo     nfs4_cb_procedures[] = {
+static struct rpc_procinfo nfs4_cb_procedures[] = {
-    PROC(CB_NULL,      NULL,     enc_cb_null,     dec_cb_null),
+        PROC(CB_NULL,   NULL,           cb_null,        cb_null),
-    PROC(CB_RECALL,    COMPOUND,   enc_cb_recall,      dec_cb_recall),
+        PROC(CB_RECALL, COMPOUND,       cb_recall,      cb_recall),
 };
-static struct rpc_version       nfs_cb_version4 = {
+static struct rpc_version nfs_cb_version4 = {
 /*
 * Note on the callback rpc program version number: despite language in rfc
 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version       nfs_cb_version4 = {
 * in practice that appears to be what implementations use.  The section
 * 18.36.3 language is expected to be fixed in an erratum.
 */
-        .number                 = 1,
+        .number                 = 1,
-        .nrprocs                = ARRAY_SIZE(nfs4_cb_procedures),
+        .nrprocs                = ARRAY_SIZE(nfs4_cb_procedures),
-        .procs                  = nfs4_cb_procedures
+        .procs                  = nfs4_cb_procedures
 };
-static struct rpc_version *     nfs_cb_version[] = {
+static struct rpc_version *nfs_cb_version[] = {
        &nfs_cb_version4,
 };
 static struct rpc_program cb_program;
 static struct rpc_stat cb_stats = {
-                .program        = &cb_program
+        .program                = &cb_program
 };
 #define NFS4_CALLBACK 0x40000000
 static struct rpc_program cb_program = {
-                .name           = "nfs4_cb",
+        .name                   = "nfs4_cb",
-                .number         = NFS4_CALLBACK,
+        .number                 = NFS4_CALLBACK,
-                .nrvers         = ARRAY_SIZE(nfs_cb_version),
+        .nrvers                 = ARRAY_SIZE(nfs_cb_version),
-                .version        = nfs_cb_version,
+        .version                = nfs_cb_version,
-                .stats          = &cb_stats,
+        .stats                  = &cb_stats,
-                .pipe_dir_name  = "/nfsd4_cb",
+        .pipe_dir_name          = "/nfsd4_cb",
 };
 static int max_cb_time(void)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 116cab970e0..fbd18c3074b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4336,7 +4336,7 @@ __nfs4_state_shutdown(void)
 void
 nfs4_state_shutdown(void)
 {
-        cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+        cancel_delayed_work_sync(&laundromat_work);
        destroy_workqueue(laundry_wq);
        locks_end_grace(&nfsd4_manager);
        nfs4_lock_state();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff0..3a359023c9f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1756,8 +1756,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
                goto out_dput_new;
        if (svc_msnfs(ffhp) &&
-                ((atomic_read(&odentry->d_count) > 1)
+                ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
-                 || (atomic_read(&ndentry->d_count) > 1))) {
                        host_err = -EPERM;
                        goto out_dput_new;
        }
@@ -1843,7 +1842,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (type != S_IFDIR) { /* It's UNLINK */
 #ifdef MSNFS
                if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-                        (atomic_read(&rdentry->d_count) > 1)) {
+                        (rdentry->d_count > 1)) {
                        host_err = -EPERM;
                } else
 #endif
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062ba..3ee67c67cc5 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 {
-        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+        return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
+}
+static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
+                                     const char *fname, int err)
+{
+        struct inode *inode = bmap->b_inode;
+        if (err == -EINVAL) {
+                nilfs_error(inode->i_sb, fname,
+                            "broken bmap (inode number=%lu)\n", inode->i_ino);
+                err = -EIO;
+        }
+        return err;
 }
 /**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
-        if (ret < 0)
+        if (ret < 0) {
+                ret = nilfs_bmap_convert_error(bmap, __func__, ret);
                goto out;
+        }
        if (NILFS_BMAP_USE_VBN(bmap)) {
                ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
                                          &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
        up_read(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_insert(bmap, key, rec);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
-        if (!ret)
-                *key = lastkey;
        up_read(&bmap->b_sem);
+        if (ret < 0)
+                ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+        else
+                *key = lastkey;
        return ret;
 }
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_delete(bmap, key);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_truncate(bmap, key);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_propagate(bmap, bh);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_mark(bmap, key, level);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb74..388e9e8f528 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
        if (pblocknr == 0) {
                pblocknr = blocknr;
                if (inode->i_ino != NILFS_DAT_INO) {
-                        struct inode *dat =
+                        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
-                                nilfs_dat_inode(NILFS_I_NILFS(inode));
                        /* blocknr is a virtual block number */
                        err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f..9d45773b79e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
                               unsigned from, unsigned to)
 {
        struct inode *dir = mapping->host;
-        struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
        loff_t pos = page_offset(page) + from;
        unsigned len = to - from;
        unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
                i_size_write(dir, pos + copied);
        if (IS_DIRSYNC(dir))
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
-        err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+        err = nilfs_set_file_dirty(dir, nr_dirty);
        WARN_ON(err); /* do not happen */
        unlock_page(page);
 }
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6f..2f560c9fb80 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
        .truncate       = nilfs_truncate,
        .setattr        = nilfs_setattr,
        .permission     = nilfs_permission,
+        .fiemap         = nilfs_fiemap,
 };
 /* end of file */
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f9..bfc73d3a30e 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
        }
        err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
-        if (unlikely(err)) {
+        if (unlikely(err))
-                if (err == -EINVAL)
+                nilfs_warning(sb, __func__, "unable to read inode: %lu",
-                        nilfs_error(sb, __func__, "ifile is broken");
+                              (unsigned long) ino);
-                else
-                        nilfs_warning(sb, __func__,
-                                      "unable to read inode: %lu",
-                                      (unsigned long) ino);
-        }
        return err;
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 71d4bc8464e..2fd440d8d6b 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
        struct nilfs_inode_info *ii = NILFS_I(inode);
        __u64 blknum = 0;
        int err = 0, ret;
-        struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
        unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
        down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                                       inode->i_ino,
                                       (unsigned long long)blkoff);
                                err = 0;
-                        } else if (err == -EINVAL) {
-                                nilfs_error(inode->i_sb, __func__,
-                                            "broken bmap (inode=%lu)\n",
-                                            inode->i_ino);
-                                err = -EIO;
                        }
                        nilfs_transaction_abort(inode->i_sb);
                        goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                nilfs_transaction_commit(inode->i_sb); /* never fails */
                /* Error handling should be detailed */
                set_buffer_new(bh_result);
+                set_buffer_delay(bh_result);
                map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
                                                      to proper value */
        } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
        if (ret) {
                struct inode *inode = page->mapping->host;
-                struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
                unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
-                nilfs_set_file_dirty(sbi, inode, nr_dirty);
+                nilfs_set_file_dirty(inode, nr_dirty);
        }
        return ret;
 }
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
                                                  start + copied);
        copied = generic_write_end(file, mapping, pos, len, copied, page,
                                   fsdata);
-        nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
+        nilfs_set_file_dirty(inode, nr_dirty);
        err = nilfs_transaction_commit(inode->i_sb);
        return err ? : copied;
 }
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
                              struct nilfs_root *root, unsigned long ino,
                              struct inode *inode)
 {
-        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
-        struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
        struct buffer_head *bh;
        struct nilfs_inode *raw_inode;
        int err;
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
        if (unlikely(err))
                goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
        }
        nilfs_ifile_unmap_inode(root->ifile, ino, bh);
        brelse(bh);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        nilfs_set_inode_flags(inode);
        return 0;
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
        brelse(bh);
 bad_inode:
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        return err;
 }
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
        if (!test_bit(NILFS_I_BMAP, &ii->i_state))
                return;
- repeat:
+repeat:
        ret = nilfs_bmap_last_key(ii->i_bmap, &b);
        if (ret == -ENOENT)
                return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
                     nilfs_bmap_truncate(ii->i_bmap, b) == 0))
                goto repeat;
- failed:
+failed:
-        if (ret == -EINVAL)
+        nilfs_warning(ii->vfs_inode.i_sb, __func__,
-                nilfs_error(ii->vfs_inode.i_sb, __func__,
+                      "failed to truncate bmap (ino=%lu, err=%d)",
-                            "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
+                      ii->vfs_inode.i_ino, ret);
-        else
-                nilfs_warning(ii->vfs_inode.i_sb, __func__,
-                              "failed to truncate bmap (ino=%lu, err=%d)",
-                              ii->vfs_inode.i_ino, ret);
 }
 void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
        nilfs_mark_inode_dirty(inode);
-        nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
+        nilfs_set_file_dirty(inode, 0);
        nilfs_transaction_commit(sb);
        /* May construct a logical segment and may fail in sync mode.
           But truncate has no return value. */
@@ -785,20 +775,24 @@ out_err:
        return err;
 }
-int nilfs_permission(struct inode *inode, int mask)
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct nilfs_root *root = NILFS_I(inode)->i_root;
+        struct nilfs_root *root;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        root = NILFS_I(inode)->i_root;
        if ((mask & MAY_WRITE) && root &&
            root->cno != NILFS_CPTREE_CURRENT_CNO)
                return -EROFS; /* snapshot is not writable */
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
-int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
-                           struct buffer_head **pbh)
 {
+        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct nilfs_inode_info *ii = NILFS_I(inode);
        int err;
@@ -839,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
        return ret;
 }
-int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
-                         unsigned nr_dirty)
 {
+        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct nilfs_inode_info *ii = NILFS_I(inode);
        atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -874,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
 int nilfs_mark_inode_dirty(struct inode *inode)
 {
-        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct buffer_head *ibh;
        int err;
-        err = nilfs_load_inode_block(sbi, inode, &ibh);
+        err = nilfs_load_inode_block(inode, &ibh);
        if (unlikely(err)) {
                nilfs_warning(inode->i_sb, __func__,
                              "failed to reget inode block.\n");
@@ -920,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
        nilfs_mark_inode_dirty(inode);
        nilfs_transaction_commit(inode->i_sb); /* never fails */
 }
+/**
+ * nilfs_fiemap - report the extent mapping of a file
+ * @inode: inode whose mapping is reported
+ * @fieinfo: fiemap descriptor passed to fiemap_check_flags() and
+ *           fiemap_fill_next_extent()
+ * @start: first byte offset of the requested range
+ * @len: length of the requested range in bytes
+ *
+ * Walks block offsets starting at @start.  Runs of buffers found by
+ * nilfs_find_uncommitted_extent() are reported with
+ * FIEMAP_EXTENT_DELALLOC; runs found by nilfs_bmap_lookup_contig() are
+ * reported with their physical address, and adjacent runs that are
+ * physically contiguous are merged into one extent.  The walk is done
+ * with inode->i_mutex held.
+ *
+ * All block-to-byte conversions are widened to __u64 before shifting so
+ * that neither a 32-bit sector_t/unsigned long nor the signed block
+ * count returned by the bmap lookup can overflow.
+ *
+ * Return: the nonzero result of fiemap_check_flags() if the flags are
+ * rejected; otherwise the last result of fiemap_fill_next_extent(), with
+ * the value 1 (extent array full) converted to 0.
+ */
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                 __u64 start, __u64 len)
+{
+        struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
+        __u64 logical = 0, phys = 0, size = 0;
+        __u32 flags = 0;
+        loff_t isize;
+        sector_t blkoff, end_blkoff;
+        sector_t delalloc_blkoff;
+        unsigned long delalloc_blklen;
+        unsigned int blkbits = inode->i_blkbits;
+        int ret, n;
+        ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+        if (ret)
+                return ret;
+        mutex_lock(&inode->i_mutex);
+        isize = i_size_read(inode);
+        blkoff = start >> blkbits;
+        end_blkoff = (start + len - 1) >> blkbits;
+        delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
+                                                        &delalloc_blkoff);
+        do {
+                __u64 blkphy;
+                unsigned int maxblocks;
+                if (delalloc_blklen && blkoff == delalloc_blkoff) {
+                        if (size) {
+                                /* End of the current extent */
+                                ret = fiemap_fill_next_extent(
+                                        fieinfo, logical, phys, size, flags);
+                                if (ret)
+                                        break;
+                        }
+                        if (blkoff > end_blkoff)
+                                break;
+                        flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
+                        logical = (__u64)blkoff << blkbits;
+                        phys = 0;
+                        size = (__u64)delalloc_blklen << blkbits;
+                        blkoff = delalloc_blkoff + delalloc_blklen;
+                        delalloc_blklen = nilfs_find_uncommitted_extent(
+                                inode, blkoff, &delalloc_blkoff);
+                        continue;
+                }
+                /*
+                 * Limit the number of blocks that we look up so as
+                 * not to get into the next delayed allocation extent.
+                 */
+                maxblocks = INT_MAX;
+                if (delalloc_blklen)
+                        maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
+                                          maxblocks);
+                blkphy = 0;
+                down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+                n = nilfs_bmap_lookup_contig(
+                        NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
+                up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+                if (n < 0) {
+                        int past_eof;
+                        /*
+                         * NOTE(review): the lookup error in n is not copied
+                         * into ret, so the function returns whatever ret
+                         * held before -- confirm this is intended.
+                         */
+                        if (unlikely(n != -ENOENT))
+                                break; /* error */
+                        /* HOLE */
+                        blkoff++;
+                        past_eof = (((__u64)blkoff << blkbits) >= isize);
+                        if (size) {
+                                /* End of the current extent */
+                                if (past_eof)
+                                        flags |= FIEMAP_EXTENT_LAST;
+                                ret = fiemap_fill_next_extent(
+                                        fieinfo, logical, phys, size, flags);
+                                if (ret)
+                                        break;
+                                size = 0;
+                        }
+                        if (blkoff > end_blkoff || past_eof)
+                                break;
+                } else {
+                        if (size) {
+                                if (phys && blkphy << blkbits == phys + size) {
+                                        /* The current extent goes on */
+                                        size += (__u64)n << blkbits;
+                                } else {
+                                        /* Terminate the current extent */
+                                        ret = fiemap_fill_next_extent(
+                                                fieinfo, logical, phys, size,
+                                                flags);
+                                        if (ret || blkoff > end_blkoff)
+                                                break;
+                                        /* Start another extent */
+                                        flags = FIEMAP_EXTENT_MERGED;
+                                        logical = (__u64)blkoff << blkbits;
+                                        phys = blkphy << blkbits;
+                                        size = (__u64)n << blkbits;
+                                }
+                        } else {
+                                /* Start a new extent */
+                                flags = FIEMAP_EXTENT_MERGED;
+                                logical = (__u64)blkoff << blkbits;
+                                phys = blkphy << blkbits;
+                                size = (__u64)n << blkbits;
+                        }
+                        blkoff += n;
+                }
+                cond_resched();
+        } while (true);
+        /* If ret is 1 then we just hit the end of the extent array */
+        if (ret == 1)
+                ret = 0;
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b185e937a33..496738963fd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
+        ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -242,8 +242,7 @@ static ssize_t
 nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
                          void *buf, size_t size, size_t nmembs)
 {
-        struct inode *dat = nilfs_dat_inode(nilfs);
+        struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
-        struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
        struct nilfs_bdesc *bdescs = buf;
        int ret, i;
@@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
        size_t nmembs = argv->v_nmembs;
        int ret;
-        ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
+        ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
        return (ret < 0) ? ret : nmembs;
 }
@@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
                                         struct nilfs_argv *argv, void *buf)
 {
        size_t nmembs = argv->v_nmembs;
-        struct inode *dat = nilfs_dat_inode(nilfs);
+        struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
-        struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
        struct nilfs_bdesc *bdescs = buf;
        int ret, i;
@@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
                        /* skip dead block */
                        continue;
                if (bdescs[i].bd_level == 0) {
-                        ret = nilfs_mdt_mark_block_dirty(dat,
+                        ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
                                                         bdescs[i].bd_offset);
                        if (ret < 0) {
                                WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9..6a0e2a189f6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
 *
 * %-ENOENT - the specified block does not exist (hole block)
 *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
- *
 * %-EROFS - Read only filesystem (for create mode)
 */
 int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
- *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
 */
 int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
 {
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
 * %-EIO - I/O error
 *
 * %-ENOENT - the specified block does not exist (hole block)
- *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
 */
 int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
 {
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
        struct buffer_head *bh_frozen;
        struct page *page;
        int blkbits = inode->i_blkbits;
-        int ret = -ENOMEM;
        page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
        if (!page)
-                return ret;
+                return -ENOMEM;
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << blkbits, 0);
        bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
-        if (bh_frozen) {
-                if (!buffer_uptodate(bh_frozen))
+        if (!buffer_uptodate(bh_frozen))
-                        nilfs_copy_buffer(bh_frozen, bh);
+                nilfs_copy_buffer(bh_frozen, bh);
-                if (list_empty(&bh_frozen->b_assoc_buffers)) {
+        if (list_empty(&bh_frozen->b_assoc_buffers)) {
-                        list_add_tail(&bh_frozen->b_assoc_buffers,
+                list_add_tail(&bh_frozen->b_assoc_buffers,
-                                      &shadow->frozen_buffers);
+                              &shadow->frozen_buffers);
-                        set_buffer_nilfs_redirected(bh);
+                set_buffer_nilfs_redirected(bh);
-                } else {
+        } else {
-                        brelse(bh_frozen); /* already frozen */
+                brelse(bh_frozen); /* already frozen */
-                }
-                ret = 0;
        }
        unlock_page(page);
        page_cache_release(page);
-        return ret;
+        return 0;
 }
 struct buffer_head *
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf16..98034271cd0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
        .rename         = nilfs_rename,
        .setattr        = nilfs_setattr,
        .permission     = nilfs_permission,
+        .fiemap         = nilfs_fiemap,
 };
 const struct inode_operations nilfs_special_inode_operations = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f7560da5a56..777e8fd0430 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
        return nilfs_test_transaction_flag(NILFS_TI_WRITER);
 }
-static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
-{
-        return nilfs->ns_dat;
-}
 /*
 * function prototype
 */
@@ -256,14 +251,14 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
-int nilfs_permission(struct inode *inode, int mask);
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
-extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
-                                  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
-extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
-                                unsigned);
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                 __u64 start, __u64 len);
 /* super.c */
 extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f..0c432416cfe 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
        }
        return nc;
 }
- 
 void nilfs_mapping_init_once(struct address_space *mapping)
 {
        memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
        }
        return TestClearPageDirty(page);
 }
+/**
+ * nilfs_find_uncommitted_extent - find extent of uncommitted data
+ * @inode: inode
+ * @start_blk: start block offset (in)
+ * @blkoff: start offset of the found extent (out)
+ *
+ * This function searches an extent of buffers marked "delayed" which
+ * starts from a block offset equal to or larger than @start_blk.  If
+ * such an extent was found, this will store the start offset in
+ * @blkoff and return its length in blocks.  Otherwise, zero is
+ * returned.
+ *
+ * Each page is locked while its buffers are examined and unlocked
+ * before moving on; @blkoff is written only when a delayed buffer is
+ * found.
+ */
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+                                            sector_t start_blk,
+                                            sector_t *blkoff)
+{
+        unsigned int i;
+        pgoff_t index;
+        unsigned int nblocks_in_page;
+        unsigned long length = 0;
+        sector_t b;
+        struct pagevec pvec;
+        struct page *page;
+        /* No pages in the mapping means there are no buffers to inspect. */
+        if (inode->i_mapping->nrpages == 0)
+                return 0;
+        /* Page index holding start_blk, and the number of blocks per page. */
+        index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        pagevec_init(&pvec, 0);
+repeat:
+        /* Fetch the next batch of at most PAGEVEC_SIZE pages from index. */
+        pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
+                                        pvec.pages);
+        if (pvec.nr == 0)
+                return length;
+        /*
+         * An extent is already being counted but the first page returned
+         * lies beyond the expected index: the run is broken, so stop.
+         */
+        if (length > 0 && pvec.pages[0]->index > index)
+                goto out;
+        /* Block offset of the first block of the first page in the batch. */
+        b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        i = 0;
+        do {
+                page = pvec.pages[i];
+                lock_page(page);
+                if (page_has_buffers(page)) {
+                        struct buffer_head *bh, *head;
+                        bh = head = page_buffers(page);
+                        do {
+                                /*
+                                 * Skip blocks before the requested start;
+                                 * "continue" still runs the loop condition,
+                                 * so b and bh are advanced.
+                                 */
+                                if (b < start_blk)
+                                        continue;
+                                if (buffer_delay(bh)) {
+                                        /* First delayed buffer starts the extent. */
+                                        if (length == 0)
+                                                *blkoff = b;
+                                        length++;
+                                } else if (length > 0) {
+                                        /* A non-delayed buffer ends the extent. */
+                                        goto out_locked;
+                                }
+                        } while (++b, bh = bh->b_this_page, bh != head);
+                } else {
+                        /*
+                         * A page without buffers ends an extent in progress;
+                         * otherwise skip over all of its blocks.
+                         */
+                        if (length > 0)
+                                goto out_locked;
+                        b += nblocks_in_page;
+                }
+                unlock_page(page);
+        } while (++i < pagevec_count(&pvec));
+        /* Resume the search right after the last page examined. */
+        index = page->index + 1;
+        pagevec_release(&pvec);
+        cond_resched();
+        goto repeat;
+out_locked:
+        /* Reached with the current page still locked. */
+        unlock_page(page);
+out:
+        pagevec_release(&pvec);
+        return length;
+}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a203..622df27cd89 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
                        struct backing_dev_info *bdi,
                        const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+                                            sector_t start_blk,
+                                            sector_t *blkoff);
 #define NILFS_PAGE_BUG(page, m, a...) \
        do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da..3dfcd3b7d38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
                if (unlikely(err))
                        goto failed_page;
-                err = nilfs_set_file_dirty(sbi, inode, 1);
+                err = nilfs_set_file_dirty(inode, 1);
                if (unlikely(err))
                        goto failed_page;
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b98..7a17715f215 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
 #include <linux/types.h>
 #include <linux/fs.h>
-/*
- * Mount options
- */
-struct nilfs_mount_options {
-        unsigned long mount_opt;
-        __u64 snapshot_cno;
-};
 struct the_nilfs;
 struct nilfs_sc_info;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea3..55ebae5c7f3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
        return err;
 }
-static int nilfs_handle_bmap_error(int err, const char *fname,
-                                   struct inode *inode, struct super_block *sb)
-{
-        if (err == -EINVAL) {
-                nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
-                            inode->i_ino);
-                err = -EIO;
-        }
-        return err;
-}
 /*
 * Callback functions that enumerate, mark, and collect dirty blocks
 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
        int err;
        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
+        if (err < 0)
-                return nilfs_handle_bmap_error(err, __func__, inode,
+                return err;
-                                               sci->sc_super);
        err = nilfs_segctor_add_file_block(sci, bh, inode,
                                           sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
                                   struct buffer_head *bh,
                                   struct inode *inode)
 {
-        int err;
+        return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
-                return nilfs_handle_bmap_error(err, __func__, inode,
-                                               sci->sc_super);
-        return 0;
 }
 static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
        int err;
        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
+        if (err < 0)
-                return nilfs_handle_bmap_error(err, __func__, inode,
+                return err;
-                                               sci->sc_super);
        err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
        if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
                ret++;
        if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
                ret++;
-        if (ret || nilfs_doing_gc())
+        if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
-                if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
+                ret++;
-                        ret++;
        return ret;
 }
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
        nilfs_mdt_clear_dirty(sci->sc_root->ifile);
        nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
        nilfs_mdt_clear_dirty(nilfs->ns_sufile);
-        nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
+        nilfs_mdt_clear_dirty(nilfs->ns_dat);
 }
 static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
                              nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
        raw_sr->sr_flags = 0;
-        nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
+        nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
                                 NILFS_SR_DAT_OFFSET(isz), 1);
        nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
                                 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                sci->sc_stage.scnt++;  /* Fall through */
        case NILFS_ST_DAT:
 dat_stage:
-                err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
+                err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
                                              &nilfs_sc_dat_ops);
                if (unlikely(err))
                        break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
        return 0;
 failed_bmap:
-        err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
        return err;
 }
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
                                if (!err) {
                                        set_buffer_uptodate(bh);
                                        clear_buffer_dirty(bh);
+                                        clear_buffer_delay(bh);
                                        clear_buffer_nilfs_volatile(bh);
                                }
                                brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
                                    b_assoc_buffers) {
                        set_buffer_uptodate(bh);
                        clear_buffer_dirty(bh);
+                        clear_buffer_delay(bh);
                        clear_buffer_nilfs_volatile(bh);
                        clear_buffer_nilfs_redirected(bh);
                        if (bh == segbuf->sb_super_root) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d..70dfdd532b8 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
 #include <linux/crc32.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
-#include <linux/kobject.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
                 const char *fmt, ...)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
 void nilfs_warning(struct super_block *sb, const char *function,
                   const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "NILFS warning (device %s): %s: ",
-               sb->s_id, function);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
 }
@@ -162,10 +170,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
        return &ii->vfs_inode;
 }
-void nilfs_destroy_inode(struct inode *inode)
+static void nilfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
        if (mdi) {
                kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
                kfree(mdi);
@@ -173,6 +184,11 @@ void nilfs_destroy_inode(struct inode *inode)
        kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
+/*
+ * Hand the inode to call_rcu() so that nilfs_i_callback() performs the
+ * actual release instead of freeing it directly here.
+ */
+void nilfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, nilfs_i_callback);
+}
 static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
@@ -838,7 +854,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-        return atomic_read(&root_dentry->d_count) > 1;
+        return root_dentry->d_count > 1;
 }
 /**
@@ -1002,11 +1018,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
        unsigned long old_sb_flags;
-        struct nilfs_mount_options old_opts;
+        unsigned long old_mount_opt;
        int err;
        old_sb_flags = sb->s_flags;
-        old_opts.mount_opt = sbi->s_mount_opt;
+        old_mount_opt = sbi->s_mount_opt;
        if (!parse_options(data, sb, 1)) {
                err = -EINVAL;
@@ -1075,7 +1091,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 restore_opts:
        sb->s_flags = old_sb_flags;
-        sbi->s_mount_opt = old_opts.mount_opt;
+        sbi->s_mount_opt = old_mount_opt;
        return err;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c..ad4ac607cf5 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        printk(KERN_INFO "NILFS: recovery complete.\n");
 skip_recovery:
-        set_nilfs_loaded(nilfs);
        nilfs_clear_recovery_info(&ri);
        sbi->s_super->s_flags = s_flags;
        return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
-        struct inode *dat = nilfs_dat_inode(nilfs);
        unsigned long ncleansegs;
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
        return 0;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b74..fd85e4c05c6 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
 /* the_nilfs struct */
 enum {
        THE_NILFS_INIT = 0,     /* Information from super_block is set */
-        THE_NILFS_LOADED,       /* Roll-back/roll-forward has done and
-                                   the latest checkpoint was loaded */
        THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
        THE_NILFS_GC_RUNNING,   /* gc process is running */
        THE_NILFS_SB_DIRTY,     /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs)			\
 }
 THE_NILFS_FNS(INIT, init)
-THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 20dc218707c..79b47cbb5cd 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
        /* determine if the children should tell inode about their events */
        watched = fsnotify_inode_watches_children(inode);
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        /* run all of the dentries associated with this inode.  Since this is a
         * directory, there damn well better only be one item on this list */
        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,19 +68,21 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
                /* run all of the children of the original inode and fix their
                 * d_flags to indicate parental interest (their parent is the
                 * original inode) */
+                spin_lock(&alias->d_lock);
                list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
                        if (!child->d_inode)
                                continue;
-                        spin_lock(&child->d_lock);
+                        spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                        if (watched)
                                child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                        else
                                child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                        spin_unlock(&child->d_lock);
                }
+                spin_unlock(&alias->d_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
 /* Notify this dentry's parent about a child's events. */
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be99254..4ff028fcfd6 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
             index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
             unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a..f4b1057abdd 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
 * single-segment behaviour.
 *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * when atomic and when not atomic.  This is ok because
+ * atomic and when not atomic.  This is ok because it calls
- * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * and it is ok to call this when non-atomic.
+ * fact, the only difference between __copy_from_user_inatomic() and
- * Infact, the only difference between __copy_from_user_inatomic() and
 * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error.  And on many
+ * should not zero the tail of the buffer on error.  And on many architectures
- * architectures __copy_from_user_inatomic() is just defined to
+ * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * __copy_from_user() so it makes no difference at all on those architectures.
+ * makes no difference at all on those architectures.
 */
 static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
                unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
                if (unlikely(copied != len)) {
                        /* Do it the slow way. */
                        addr = kmap(*pages);
-                        copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
+                        copied = __ntfs_copy_from_user_iovec_inatomic(addr +
-                                        *iov, *iov_ofs, len);
+                                        ofs, *iov, *iov_ofs, len);
-                        /*
-                         * Zero the rest of the target like __copy_from_user().
-                         */
-                        memset(addr + ofs + copied, 0, len - copied);
-                        kunmap(*pages);
                        if (unlikely(copied != len))
                                goto err_out;
+                        kunmap(*pages);
                }
                total += len;
+                ntfs_set_next_iovec(iov, iov_ofs, len);
                bytes -= len;
                if (!bytes)
                        break;
-                ntfs_set_next_iovec(iov, iov_ofs, len);
                ofs = 0;
        } while (++pages < last_page);
 out:
        return total;
 err_out:
-        total += copied;
+        BUG_ON(copied > len);
        /* Zero the rest of the target like __copy_from_user(). */
+        memset(addr + ofs + copied, 0, len - copied);
+        kunmap(*pages);
+        total += copied;
+        ntfs_set_next_iovec(iov, iov_ofs, copied);
        while (++pages < last_page) {
                bytes -= len;
                if (!bytes)
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc..a627ed82c0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
        return NULL;
 }
+static void ntfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
 void ntfs_destroy_big_inode(struct inode *inode)
 {
        ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
        BUG_ON(ni->page);
        if (!atomic_dec_and_test(&ni->count))
                BUG();
-        kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+        call_rcu(&inode->i_rcu, ntfs_i_callback);
 }
 static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f..29099a07b9f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
 /*
 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2001,2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
        ntfs_sysctl(0);
 }
-MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698..ab152c00cd3 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -51,7 +51,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
 config OCFS2_FS_STATS
        bool "OCFS2 statistics"
-        depends on OCFS2_FS
+        depends on OCFS2_FS && DEBUG_FS
        default y
        help
          This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe..704f6b1742f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
        return ret;
 }
-int ocfs2_check_acl(struct inode *inode, int mask)
+int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb;
        struct buffer_head *di_bh = NULL;
        struct posix_acl *acl;
        int ret = -EAGAIN;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        osb = OCFS2_SB(inode->i_sb);
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return ret;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f0585..4fe7c9cf4bf 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
        __le32 e_id;
 };
-extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_check_acl(struct inode *, int, unsigned int);
 extern int ocfs2_acl_chmod(struct inode *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
                          struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d..e4984e259cb 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
        return ret;
 }
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
                                         struct ocfs2_extent_block *eb);
 static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        ocfs2_journal_dirty(handle, tl_bh);
+        osb->truncated_clusters += num_clusters;
 bail:
        mlog_exit(status);
        return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                i--;
        }
+        osb->truncated_clusters = 0;
 bail:
        mlog_exit(status);
        return status;
@@ -7139,64 +7141,6 @@ bail:
 }
 /*
- * Expects the inode to already be locked.
- */
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
-                           struct inode *inode,
-                           struct buffer_head *fe_bh,
-                           struct ocfs2_truncate_context **tc)
-{
-        int status;
-        unsigned int new_i_clusters;
-        struct ocfs2_dinode *fe;
-        struct ocfs2_extent_block *eb;
-        struct buffer_head *last_eb_bh = NULL;
-        mlog_entry_void();
-        *tc = NULL;
-        new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
-                                                  i_size_read(inode));
-        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
-             "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
-             (unsigned long long)le64_to_cpu(fe->i_size));
-        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
-        if (!(*tc)) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
-        if (fe->id2.i_list.l_tree_depth) {
-                status = ocfs2_read_extent_block(INODE_CACHE(inode),
-                                                 le64_to_cpu(fe->i_last_eb_blk),
-                                                 &last_eb_bh);
-                if (status < 0) {
-                        mlog_errno(status);
-                        goto bail;
-                }
-                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-        }
-        (*tc)->tc_last_eb_bh = last_eb_bh;
-        status = 0;
-bail:
-        if (status < 0) {
-                if (*tc)
-                        ocfs2_free_truncate_context(*tc);
-                *tc = NULL;
-        }
-        mlog_exit_void();
-        return status;
-}
-/*
 * 'start' is inclusive, 'end' is not.
 */
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
 out:
        return ret;
 }
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
-{
-        /*
-         * The caller is responsible for completing deallocation
-         * before freeing the context.
-         */
-        if (tc->tc_dealloc.c_first_suballocator != NULL)
-                mlog(ML_NOTICE,
-                     "Truncate completion has non-empty dealloc context\n");
-        brelse(tc->tc_last_eb_bh);
-        kfree(tc);
-}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b9..3bd08a03251 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
                                  u64 range_start, u64 range_end);
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
-                           struct inode *inode,
-                           struct buffer_head *fe_bh,
-                           struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct inode *inode,
                          struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..1fbb0e20131 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+        if (ocfs2_iocb_is_sem_locked(iocb)) {
+                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        ocfs2_iocb_clear_rw_locked(iocb);
        level = ocfs2_iocb_rw_locked_level(iocb);
-        if (!level)
-                up_read(&inode->i_alloc_sem);
        ocfs2_rw_unlock(inode, level);
        if (is_async)
@@ -1627,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
        return ret;
 }
+/*
+ * Try to flush the truncate log if we can free enough clusters from it.
+ * Return value: "< 0" means error, "0" means no space, and "1" means
+ * we have freed enough space and the caller should try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+                                          unsigned int needed)
+{
+        tid_t target;
+        int ret = 0;
+        unsigned int truncated_clusters;
+        mutex_lock(&osb->osb_tl_inode->i_mutex);
+        truncated_clusters = osb->truncated_clusters;
+        mutex_unlock(&osb->osb_tl_inode->i_mutex);
+        /*
+         * Check whether we can succeed in allocating if we free
+         * the truncate log.
+         */
+        if (truncated_clusters < needed)
+                goto out;
+        ret = ocfs2_flush_truncate_log(osb);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+                jbd2_log_wait_commit(osb->journal->j_journal, target);
+                ret = 1;
+        }
+out:
+        return ret;
+}
 int ocfs2_write_begin_nolock(struct file *filp,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
@@ -1634,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                             struct buffer_head *di_bh, struct page *mmap_page)
 {
        int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
-        unsigned int clusters_to_alloc, extents_to_split;
+        unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
        struct ocfs2_write_ctxt *wc;
        struct inode *inode = mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1643,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
        struct ocfs2_alloc_context *meta_ac = NULL;
        handle_t *handle;
        struct ocfs2_extent_tree et;
+        int try_free = 1, ret1;
+try_again:
        ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
        if (ret) {
                mlog_errno(ret);
@@ -1678,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                mlog_errno(ret);
                goto out;
        } else if (ret == 1) {
+                clusters_need = wc->w_clen;
                ret = ocfs2_refcount_cow(inode, filp, di_bh,
                                         wc->w_cpos, wc->w_clen, UINT_MAX);
                if (ret) {
@@ -1692,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                mlog_errno(ret);
                goto out;
        }
+        clusters_need += clusters_to_alloc;
        di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
@@ -1814,6 +1858,22 @@ out:
                ocfs2_free_alloc_context(data_ac);
        if (meta_ac)
                ocfs2_free_alloc_context(meta_ac);
+        if (ret == -ENOSPC && try_free) {
+                /*
+                 * Try to free some truncate log so that we can have enough
+                 * clusters to allocate.
+                 */
+                try_free = 0;
+                ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+                if (ret1 == 1)
+                        goto try_again;
+                if (ret1 < 0)
+                        mlog_errno(ret1);
+        }
        return ret;
 }
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691..eceb456037c 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
        else
                clear_bit(1, (unsigned long *)&iocb->private);
 }
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+        OCFS2_IOCB_RW_LOCK = 0,
+        OCFS2_IOCB_RW_LOCK_LEVEL,
+        OCFS2_IOCB_SEM,
+        OCFS2_IOCB_NUM_LOCKS
+};
 #define ocfs2_iocb_clear_rw_locked(iocb) \
-        clear_bit(0, (unsigned long *)&iocb->private)
+        clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_rw_locked_level(iocb) \
-        test_bit(1, (unsigned long *)&iocb->private)
+        test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+        set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+        clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+        test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9f26ac9be2a..a6cc05302e9 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 #define O2HB_DB_TYPE_REGION_LIVENODES   4
 #define O2HB_DB_TYPE_REGION_NUMBER      5
 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME        6
+#define O2HB_DB_TYPE_REGION_PINNED      7
 struct o2hb_debug_buf {
        int db_type;
        int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
 #define O2HB_DEBUG_FAILEDREGIONS        "failed_regions"
 #define O2HB_DEBUG_REGION_NUMBER        "num"
 #define O2HB_DEBUG_REGION_ELAPSED_TIME  "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED        "pinned"
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
 unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF                3
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
 /* Only sets a new threshold if there are no active regions.
 *
 * No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
        struct config_item      hr_item;
        struct list_head        hr_all_item;
-        unsigned                hr_unclean_stop:1;
+        unsigned                hr_unclean_stop:1,
+                                hr_item_pinned:1,
+                                hr_item_dropped:1;
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
        struct dentry           *hr_debug_livenodes;
        struct dentry           *hr_debug_regnum;
        struct dentry           *hr_debug_elapsed_time;
+        struct dentry           *hr_debug_pinned;
        struct o2hb_debug_buf   *hr_db_livenodes;
        struct o2hb_debug_buf   *hr_db_regnum;
        struct o2hb_debug_buf   *hr_db_elapsed_time;
+        struct o2hb_debug_buf   *hr_db_pinned;
        /* let the person setting up hb wait for it to return until it
         * has reached a 'steady' state.  This will be fixed when we have
@@ -307,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
 {
-        cancel_delayed_work(&reg->hr_write_timeout_work);
+        cancel_delayed_work_sync(&reg->hr_write_timeout_work);
-        flush_scheduled_work();
 }
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -702,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
               config_item_name(&reg->hr_item));
        set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+        /*
+         * If global heartbeat is active, unpin all regions if the
+         * quorum region count > CUT_OFF
+         */
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+                o2hb_region_unpin(NULL);
 }
 static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1042,6 +1082,9 @@ static int o2hb_thread(void *data)
        set_user_nice(current, -20);
+        /* Pin node */
+        o2nm_depend_this_node();
        while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                /* We track the time spent inside
                 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1091,6 +1134,9 @@ static int o2hb_thread(void *data)
                mlog_errno(ret);
        }
+        /* Unpin node */
+        o2nm_undepend_this_node();
        mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
        return 0;
@@ -1143,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
                                                 reg->hr_last_timeout_start));
                goto done;
+        case O2HB_DB_TYPE_REGION_PINNED:
+                reg = (struct o2hb_region *)db->db_data;
+                out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+                                !!reg->hr_item_pinned);
+                goto done;
        default:
                goto done;
        }
@@ -1316,6 +1368,8 @@ int o2hb_init(void)
        memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
        memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+        o2hb_dependent_users = 0;
        return o2hb_debug_init();
 }
@@ -1385,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
        debugfs_remove(reg->hr_debug_livenodes);
        debugfs_remove(reg->hr_debug_regnum);
        debugfs_remove(reg->hr_debug_elapsed_time);
+        debugfs_remove(reg->hr_debug_pinned);
        debugfs_remove(reg->hr_debug_dir);
        spin_lock(&o2hb_live_lock);
@@ -1949,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
                goto bail;
        }
+        reg->hr_debug_pinned =
+                        o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+                                          reg->hr_debug_dir,
+                                          &(reg->hr_db_pinned),
+                                          sizeof(*(reg->hr_db_pinned)),
+                                          O2HB_DB_TYPE_REGION_PINNED,
+                                          0, 0, reg);
+        if (!reg->hr_debug_pinned) {
+                mlog_errno(ret);
+                goto bail;
+        }
        ret = 0;
 bail:
        return ret;
@@ -2003,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 {
        struct task_struct *hb_task;
        struct o2hb_region *reg = to_o2hb_region(item);
+        int quorum_region = 0;
        /* stop the thread when the user removes the region dir */
        spin_lock(&o2hb_live_lock);
        if (o2hb_global_heartbeat_active()) {
                clear_bit(reg->hr_region_num, o2hb_region_bitmap);
                clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+                if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+                        quorum_region = 1;
+                clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
        }
        hb_task = reg->hr_task;
        reg->hr_task = NULL;
+        reg->hr_item_dropped = 1;
        spin_unlock(&o2hb_live_lock);
        if (hb_task)
@@ -2029,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
        if (o2hb_global_heartbeat_active())
                printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
                       config_item_name(&reg->hr_item));
        config_item_put(item);
+        if (!o2hb_global_heartbeat_active() || !quorum_region)
+                return;
+        /*
+         * If global heartbeat active and there are dependent users,
+         * pin all regions if quorum region count <= CUT_OFF
+         */
+        spin_lock(&o2hb_live_lock);
+        if (!o2hb_dependent_users)
+                goto unlock;
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+                o2hb_region_pin(NULL);
+unlock:
+        spin_unlock(&o2hb_live_lock);
 }
 struct o2hb_heartbeat_group_attribute {
@@ -2215,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 }
 EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
 {
-        struct o2hb_region *p, *reg = NULL;
+        int ret = 0, found = 0;
+        struct o2hb_region *reg;
+        char *uuid;
        assert_spin_locked(&o2hb_live_lock);
-        list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
+        list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
-                if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
+                uuid = config_item_name(&reg->hr_item);
-                        reg = p;
-                        break;
+                /* local heartbeat */
+                if (region_uuid) {
+                        if (strcmp(region_uuid, uuid))
+                                continue;
+                        found = 1;
+                }
+                if (reg->hr_item_pinned || reg->hr_item_dropped)
+                        goto skip_pin;
+                /* Ignore ENOENT only for local hb (userdlm domain) */
+                ret = o2nm_depend_item(&reg->hr_item);
+                if (!ret) {
+                        mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+                        reg->hr_item_pinned = 1;
+                } else {
+                        if (ret == -ENOENT && found)
+                                ret = 0;
+                        else {
+                                mlog(ML_ERROR, "Pin region %s fails with %d\n",
+                                     uuid, ret);
+                                break;
+                        }
                }
+skip_pin:
+                if (found)
+                        break;
        }
-        return reg;
+        return ret;
 }
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
 {
-        int ret = 0;
        struct o2hb_region *reg;
+        char *uuid;
+        int found = 0;
-        spin_lock(&o2hb_live_lock);
+        assert_spin_locked(&o2hb_live_lock);
-        reg = o2hb_find_region(region_uuid);
+        list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
-        if (!reg)
+                uuid = config_item_name(&reg->hr_item);
-                ret = -ENOENT;
+                if (region_uuid) {
-        spin_unlock(&o2hb_live_lock);
+                        if (strcmp(region_uuid, uuid))
+                                continue;
+                        found = 1;
+                }
-        if (ret)
+                if (reg->hr_item_pinned) {
-                goto out;
+                        mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+                        o2nm_undepend_item(&reg->hr_item);
+                        reg->hr_item_pinned = 0;
+                }
+                if (found)
+                        break;
+        }
+}
-        ret = o2nm_depend_this_node();
+static int o2hb_region_inc_user(const char *region_uuid)
-        if (ret)
+{
-                goto out;
+        int ret = 0;
-        ret = o2nm_depend_item(&reg->hr_item);
+        spin_lock(&o2hb_live_lock);
-        if (ret)
-                o2nm_undepend_this_node();
-out:
+        /* local heartbeat */
+        if (!o2hb_global_heartbeat_active()) {
+            ret = o2hb_region_pin(region_uuid);
+            goto unlock;
+        }
+        /*
+         * if global heartbeat active and this is the first dependent user,
+         * pin all regions if quorum region count <= CUT_OFF
+         */
+        o2hb_dependent_users++;
+        if (o2hb_dependent_users > 1)
+                goto unlock;
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+                ret = o2hb_region_pin(NULL);
+unlock:
+        spin_unlock(&o2hb_live_lock);
        return ret;
 }
-static void o2hb_region_put(const char *region_uuid)
+static void o2hb_region_dec_user(const char *region_uuid)
 {
-        struct o2hb_region *reg;
        spin_lock(&o2hb_live_lock);
-        reg = o2hb_find_region(region_uuid);
+        /* local heartbeat: unpin only the region matching region_uuid */
+        if (!o2hb_global_heartbeat_active()) {
+                o2hb_region_unpin(region_uuid);
+                goto unlock;
+        }
-        spin_unlock(&o2hb_live_lock);
+        /*
+         * If global heartbeat is active and no dependent users remain,
+         * unpin all regions
+         */
+        o2hb_dependent_users--;
+        if (!o2hb_dependent_users)
+                o2hb_region_unpin(NULL);
-        if (reg) {
+unlock:
-                o2nm_undepend_item(&reg->hr_item);
+        spin_unlock(&o2hb_live_lock);
-                o2nm_undepend_this_node();
-        }
 }
 int o2hb_register_callback(const char *region_uuid,
@@ -2292,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
        }
        if (region_uuid) {
-                ret = o2hb_region_get(region_uuid);
+                ret = o2hb_region_inc_user(region_uuid);
-                if (ret)
+                if (ret) {
+                        mlog_errno(ret);
                        goto out;
+                }
        }
        down_write(&o2hb_callback_sem);
@@ -2312,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
        up_write(&o2hb_callback_sem);
        ret = 0;
 out:
-        mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+        mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
             ret, __builtin_return_address(0), hc);
        return ret;
 }
@@ -2323,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
 {
        BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
-        mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+        mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
             __builtin_return_address(0), hc);
        /* XXX Can this happen _with_ a region reference? */
@@ -2331,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
                return;
        if (region_uuid)
-                o2hb_region_put(region_uuid);
+                o2hb_region_dec_user(region_uuid);
        down_write(&o2hb_callback_sem);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392..6c61771469a 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUOTA),
        define_mask(REFCOUNT),
        define_mask(BASTS),
+        define_mask(RESERVATIONS),
+        define_mask(CLUSTER),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
-        define_mask(RESERVATIONS),
 };
 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c9..34d6544357d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
 #include <linux/sched.h>
 /* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
 #define ML_ENTRY        0x0000000000000001ULL /* func call entry */
 #define ML_EXIT         0x0000000000000002ULL /* func call exit */
 #define ML_TCP          0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_REFCOUNT     0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS        0x0000001000000000ULL /* dlmglue asts and basts */
+#define ML_BASTS        0x0000000100000000ULL /* dlmglue asts and basts */
+#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER      0x0000000400000000ULL /* cluster stack */
 /* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_ERROR        0x1000000000000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE       0x0000000200000000ULL /* setn to KERN_NOTICE */
+#define ML_NOTICE       0x2000000000000000ULL /* sent to KERN_NOTICE */
-#define ML_KTHREAD      0x0000000400000000ULL /* kernel thread activity */
+#define ML_KTHREAD      0x4000000000000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
-#define ML_CLUSTER      0x0000001000000000ULL /* cluster stack */
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b0..3a5835904b3 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
 #define O2NET_DEBUG_DIR         "o2net"
 #define SC_DEBUG_NAME           "sock_containers"
 #define NST_DEBUG_NAME          "send_tracking"
+#define STATS_DEBUG_NAME        "stats"
+#define SHOW_SOCK_CONTAINERS    0
+#define SHOW_SOCK_STATS         1
 static struct dentry *o2net_dentry;
 static struct dentry *sc_dentry;
 static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
 static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static int nst_seq_show(struct seq_file *seq, void *v)
 {
        struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+        ktime_t now;
+        s64 sock, send, status;
        spin_lock(&o2net_debug_lock);
        nst = next_nst(dummy_nst);
+        if (!nst)
+                goto out;
-        if (nst != NULL) {
+        now = ktime_get();
-                /* get_task_comm isn't exported.  oh well. */
+        sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
-                seq_printf(seq, "%p:\n"
+        send = ktime_to_us(ktime_sub(now, nst->st_send_time));
-                           "  pid:          %lu\n"
+        status = ktime_to_us(ktime_sub(now, nst->st_status_time));
-                           "  tgid:         %lu\n"
-                           "  process name: %s\n"
+        /* get_task_comm isn't exported.  oh well. */
-                           "  node:         %u\n"
+        seq_printf(seq, "%p:\n"
-                           "  sc:           %p\n"
+                   "  pid:          %lu\n"
-                           "  message id:   %d\n"
+                   "  tgid:         %lu\n"
-                           "  message type: %u\n"
+                   "  process name: %s\n"
-                           "  message key:  0x%08x\n"
+                   "  node:         %u\n"
-                           "  sock acquiry: %lu.%ld\n"
+                   "  sc:           %p\n"
-                           "  send start:   %lu.%ld\n"
+                   "  message id:   %d\n"
-                           "  wait start:   %lu.%ld\n",
+                   "  message type: %u\n"
-                           nst, (unsigned long)nst->st_task->pid,
+                   "  message key:  0x%08x\n"
-                           (unsigned long)nst->st_task->tgid,
+                   "  sock acquiry: %lld usecs ago\n"
-                           nst->st_task->comm, nst->st_node,
+                   "  send start:   %lld usecs ago\n"
-                           nst->st_sc, nst->st_id, nst->st_msg_type,
+                   "  wait start:   %lld usecs ago\n",
-                           nst->st_msg_key,
+                   nst, (unsigned long)task_pid_nr(nst->st_task),
-                           nst->st_sock_time.tv_sec,
+                   (unsigned long)nst->st_task->tgid,
-                           (long)nst->st_sock_time.tv_usec,
+                   nst->st_task->comm, nst->st_node,
-                           nst->st_send_time.tv_sec,
+                   nst->st_sc, nst->st_id, nst->st_msg_type,
-                           (long)nst->st_send_time.tv_usec,
+                   nst->st_msg_key,
-                           nst->st_status_time.tv_sec,
+                   (long long)sock,
-                           (long)nst->st_status_time.tv_usec);
+                   (long long)send,
-        }
+                   (long long)status);
+out:
        spin_unlock(&o2net_debug_lock);
        return 0;
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
        spin_unlock(&o2net_debug_lock);
 }
+struct o2net_sock_debug {       /* per-open state stored in seq->private by sc_common_open() */
+        int dbg_ctxt;   /* SHOW_SOCK_CONTAINERS or SHOW_SOCK_STATS; selects the printer in sc_seq_show() */
+        struct o2net_sock_container *dbg_sock;  /* dummy container registered with o2net_debug_add_sc() and passed to next_sc() */
+};
 static struct o2net_sock_container
                        *next_sc(struct o2net_sock_container *sc_start)
 {
@@ -253,7 +268,8 @@ static struct o2net_sock_container
 static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
 static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        return sc; /* unused, just needs to be null when done */
 }
-#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s)              ((_s)->sc_send_count)
+# define sc_recv_count(_s)              ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s)     (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s)        (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s)      (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s)     (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s)              (0U)
+# define sc_recv_count(_s)              (0U)
+# define sc_tv_acquiry_total_ns(_s)     (0LL)
+# define sc_tv_send_total_ns(_s)        (0LL)
+# define sc_tv_status_total_ns(_s)      (0LL)
+# define sc_tv_process_total_ns(_s)     (0LL)
+#endif
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION         1
+static void sc_show_sock_stats(struct seq_file *seq,
+                               struct o2net_sock_container *sc)
+{       /* Print one comma-separated stats record for @sc to @seq; a NULL @sc prints nothing. */
+        if (!sc)
+                return;
+        seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+                   sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),       /* node number, messages sent */
+                   (long long)sc_tv_acquiry_total_ns(sc),       /* the *_total_ns() wrappers convert with ktime_to_ns() */
+                   (long long)sc_tv_send_total_ns(sc),
+                   (long long)sc_tv_status_total_ns(sc),
+                   (unsigned long)sc_recv_count(sc),    /* messages received */
+                   (long long)sc_tv_process_total_ns(sc));      /* all sc_*() wrappers are constant 0 without CONFIG_OCFS2_FS_STATS */
+}
+static void sc_show_sock_container(struct seq_file *seq,
+                                   struct o2net_sock_container *sc)
+{       /* Print a multi-line, human-readable description of @sc to @seq; a NULL @sc prints nothing. */
+        struct inet_sock *inet = NULL;
+        __be32 saddr = 0, daddr = 0;    /* stay zero when sc has no socket */
+        __be16 sport = 0, dport = 0;
+        if (!sc)
+                return;
+        if (sc->sc_sock) {
+                inet = inet_sk(sc->sc_sock->sk);
+                /* the stack's structs aren't sparse endian clean */
+                saddr = (__force __be32)inet->inet_saddr;
+                daddr = (__force __be32)inet->inet_daddr;
+                sport = (__force __be16)inet->inet_sport;
+                dport = (__force __be16)inet->inet_dport;
+        }
+        /* XXX sigh, inet-> doesn't have sparse annotation so any
+         * use of it here generates a warning with -Wbitwise */
+        seq_printf(seq, "%p:\n"
+                   "  krefs:           %d\n"
+                   "  sock:            %pI4:%u -> "
+                                      "%pI4:%u\n"
+                   "  remote node:     %s\n"
+                   "  page off:        %zu\n"
+                   "  handshake ok:    %u\n"
+                   "  timer:           %lld usecs\n"
+                   "  data ready:      %lld usecs\n"
+                   "  advance start:   %lld usecs\n"
+                   "  advance stop:    %lld usecs\n"
+                   "  func start:      %lld usecs\n"
+                   "  func stop:       %lld usecs\n"
+                   "  func key:        0x%08x\n"
+                   "  func type:       %u\n",
+                   sc,
+                   atomic_read(&sc->sc_kref.refcount),
+                   &saddr, inet ? ntohs(sport) : 0,     /* port printed as 0 when there is no socket */
+                   &daddr, inet ? ntohs(dport) : 0,
+                   sc->sc_node->nd_name,
+                   sc->sc_page_off,
+                   sc->sc_handshake_ok,
+                   (long long)ktime_to_us(sc->sc_tv_timer),     /* stored ktime values converted to usecs; no delta against "now" is taken here */
+                   (long long)ktime_to_us(sc->sc_tv_data_ready),
+                   (long long)ktime_to_us(sc->sc_tv_advance_start),
+                   (long long)ktime_to_us(sc->sc_tv_advance_stop),
+                   (long long)ktime_to_us(sc->sc_tv_func_start),
+                   (long long)ktime_to_us(sc->sc_tv_func_stop),
+                   sc->sc_msg_key,
+                   sc->sc_msg_type);
+}
 static int sc_seq_show(struct seq_file *seq, void *v)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
-        if (sc != NULL) {
+        if (sc) {
-                struct inet_sock *inet = NULL;
+                if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+                        sc_show_sock_container(seq, sc);
-                __be32 saddr = 0, daddr = 0;
+                else
-                __be16 sport = 0, dport = 0;
+                        sc_show_sock_stats(seq, sc);
-                if (sc->sc_sock) {
-                        inet = inet_sk(sc->sc_sock->sk);
-                        /* the stack's structs aren't sparse endian clean */
-                        saddr = (__force __be32)inet->inet_saddr;
-                        daddr = (__force __be32)inet->inet_daddr;
-                        sport = (__force __be16)inet->inet_sport;
-                        dport = (__force __be16)inet->inet_dport;
-                }
-                /* XXX sigh, inet-> doesn't have sparse annotation so any
-                 * use of it here generates a warning with -Wbitwise */
-                seq_printf(seq, "%p:\n"
-                           "  krefs:           %d\n"
-                           "  sock:            %pI4:%u -> "
-                                              "%pI4:%u\n"
-                           "  remote node:     %s\n"
-                           "  page off:        %zu\n"
-                           "  handshake ok:    %u\n"
-                           "  timer:           %lu.%ld\n"
-                           "  data ready:      %lu.%ld\n"
-                           "  advance start:   %lu.%ld\n"
-                           "  advance stop:    %lu.%ld\n"
-                           "  func start:      %lu.%ld\n"
-                           "  func stop:       %lu.%ld\n"
-                           "  func key:        %u\n"
-                           "  func type:       %u\n",
-                           sc,
-                           atomic_read(&sc->sc_kref.refcount),
-                           &saddr, inet ? ntohs(sport) : 0,
-                           &daddr, inet ? ntohs(dport) : 0,
-                           sc->sc_node->nd_name,
-                           sc->sc_page_off,
-                           sc->sc_handshake_ok,
-                           TV_SEC_USEC(sc->sc_tv_timer),
-                           TV_SEC_USEC(sc->sc_tv_data_ready),
-                           TV_SEC_USEC(sc->sc_tv_advance_start),
-                           TV_SEC_USEC(sc->sc_tv_advance_stop),
-                           TV_SEC_USEC(sc->sc_tv_func_start),
-                           TV_SEC_USEC(sc->sc_tv_func_stop),
-                           sc->sc_msg_key,
-                           sc->sc_msg_type);
        }
        spin_unlock(&o2net_debug_lock);
        return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
        .show = sc_seq_show,
 };
-static int sc_fop_open(struct inode *inode, struct file *file)
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
 {
        struct o2net_sock_container *dummy_sc;
        struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
                goto out;
        seq = file->private_data;
-        seq->private = dummy_sc;
+        seq->private = sd;
+        sd->dbg_sock = dummy_sc;
        o2net_debug_add_sc(dummy_sc);
        dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
 static int sc_fop_release(struct inode *inode, struct file *file)
 {
        struct seq_file *seq = file->private_data;
-        struct o2net_sock_container *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *dummy_sc = sd->dbg_sock;
        o2net_debug_del_sc(dummy_sc);
        return seq_release_private(inode, file);
 }
+/*
+ * Open handler for the "stats" debugfs file.
+ *
+ * Allocates the per-open o2net_sock_debug in SHOW_SOCK_STATS mode and
+ * passes it to sc_common_open().  Returns 0 on success, -ENOMEM if the
+ * allocation fails, or the error returned by sc_common_open().
+ */
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_sock_debug *sd;
+        int ret;
+        sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+        if (sd == NULL)
+                return -ENOMEM;
+        sd->dbg_ctxt = SHOW_SOCK_STATS;
+        sd->dbg_sock = NULL;
+        ret = sc_common_open(file, sd);
+        /*
+         * sc_common_open() stores sd in seq->private only on its success
+         * path, so on failure nothing else owns sd and it must be freed
+         * here to avoid leaking it.
+         */
+        if (ret)
+                kfree(sd);
+        return ret;
+}
+static const struct file_operations stats_seq_fops = {  /* file operations passed to debugfs_create_file() for STATS_DEBUG_NAME */
+        .open = stats_fop_open, /* sets up an o2net_sock_debug with dbg_ctxt = SHOW_SOCK_STATS */
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = sc_fop_release,      /* unregisters the dummy container, then seq_release_private() */
+};
+/*
+ * Open handler for the sock_containers debugfs file (sc_seq_fops).
+ *
+ * Allocates the per-open o2net_sock_debug in SHOW_SOCK_CONTAINERS mode and
+ * passes it to sc_common_open().  Returns 0 on success, -ENOMEM if the
+ * allocation fails, or the error returned by sc_common_open().
+ */
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_sock_debug *sd;
+        int ret;
+        sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+        if (sd == NULL)
+                return -ENOMEM;
+        sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+        sd->dbg_sock = NULL;
+        ret = sc_common_open(file, sd);
+        /*
+         * sc_common_open() stores sd in seq->private only on its success
+         * path, so on failure nothing else owns sd and it must be freed
+         * here to avoid leaking it.
+         */
+        if (ret)
+                kfree(sd);
+        return ret;
+}
 static const struct file_operations sc_seq_fops = {
        .open = sc_fop_open,
        .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
                goto bail;
        }
+        stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                           o2net_dentry, NULL,
+                                           &stats_seq_fops);
+        if (!stats_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
        return 0;
 bail:
-        if (sc_dentry)
+        debugfs_remove(stats_dentry);
-                debugfs_remove(sc_dentry);
+        debugfs_remove(sc_dentry);
-        if (nst_dentry)
+        debugfs_remove(nst_dentry);
-                debugfs_remove(nst_dentry);
+        debugfs_remove(o2net_dentry);
-        if (o2net_dentry)
-                debugfs_remove(o2net_dentry);
        return -ENOMEM;
 }
 void o2net_debugfs_exit(void)
 {
-        if (sc_dentry)
+        debugfs_remove(stats_dentry);
-                debugfs_remove(sc_dentry);
+        debugfs_remove(sc_dentry);
-        if (nst_dentry)
+        debugfs_remove(nst_dentry);
-                debugfs_remove(nst_dentry);
+        debugfs_remove(o2net_dentry);
-        if (o2net_dentry)
-                debugfs_remove(o2net_dentry);
 }
 #endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e1669621..a87366750f2 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
 void o2quo_exit(void)
 {
-        flush_scheduled_work();
+        struct o2quo_state *qs = &o2quo_state;
+        flush_work_sync(&qs->qs_work);
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e4212..3b11cb1e38f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
        nst->st_node = node;
 }
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_sock_time);
+        nst->st_sock_time = ktime_get();
 }
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_send_time);
+        nst->st_send_time = ktime_get();
 }
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_status_time);
+        nst->st_status_time = ktime_get();
 }
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-                                         struct o2net_sock_container *sc)
+                                                struct o2net_sock_container *sc)
 {
        nst->st_sc = sc;
 }
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+                                        u32 msg_id)
 {
        nst->st_id = msg_id;
 }
-#else  /* CONFIG_DEBUG_FS */
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-                                  u32 msgkey, struct task_struct *task, u8 node)
 {
+        sc->sc_tv_timer = ktime_get();
 }
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_data_ready = ktime_get();
 }
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_advance_start = ktime_get();
 }
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_advance_stop = ktime_get();
 }
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
-                                                struct o2net_sock_container *sc)
 {
+        sc->sc_tv_func_start = ktime_get();
 }
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
-                                        u32 msg_id)
 {
+        sc->sc_tv_func_stop = ktime_get();
 }
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{       /* Return sc_tv_func_stop minus sc_tv_func_start for @sc. */
+        return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+#else  /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
+# define o2net_get_func_run_time(a)             (ktime_t)0
 #endif /* CONFIG_DEBUG_FS */
+#ifdef CONFIG_OCFS2_FS_STATS
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+                                    struct o2net_sock_container *sc)
+{       /* Fold the three timing intervals recorded in @nst into @sc's running totals and bump the send count. */
+        sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,      /* += now - st_status_time */
+                                           ktime_sub(ktime_get(),
+                                                     nst->st_status_time));
+        sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,  /* += st_status_time - st_send_time */
+                                         ktime_sub(nst->st_status_time,
+                                                   nst->st_send_time));
+        sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,    /* += st_send_time - st_sock_time */
+                                            ktime_sub(nst->st_send_time,
+                                                      nst->st_sock_time));
+        sc->sc_send_count++;    /* NOTE(review): no locking is visible in this function; confirm callers serialize updates */
+}
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{       /* Add the last handler run time to @sc's processing total and bump the receive count. */
+        sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+                                            o2net_get_func_run_time(sc));       /* expands to (ktime_t)0 when CONFIG_DEBUG_FS is off */
+        sc->sc_recv_count++;
+}
+#else
+# define o2net_update_send_stats(a, b)
+# define o2net_update_recv_stats(sc)
+#endif /* CONFIG_OCFS2_FS_STATS */
 static inline int o2net_reconnect_delay(void)
 {
        return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
                sc->sc_sock = NULL;
        }
+        o2nm_undepend_item(&sc->sc_node->nd_item);
        o2nm_node_put(sc->sc_node);
        sc->sc_node = NULL;
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 {
        struct o2net_sock_container *sc, *ret = NULL;
        struct page *page = NULL;
+        int status = 0;
        page = alloc_page(GFP_NOFS);
        sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
        o2nm_node_get(node);
        sc->sc_node = node;
+        /* pin the node item of the remote node */
+        status = o2nm_depend_item(&node->nd_item);
+        if (status) {
+                mlog_errno(status);
+                o2nm_node_put(node);
+                goto out;
+        }
        INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
        INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
        INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
        if (sk->sk_user_data) {
                struct o2net_sock_container *sc = sk->sk_user_data;
                sclog(sc, "data_ready hit\n");
-                do_gettimeofday(&sc->sc_tv_data_ready);
+                o2net_set_data_ready_time(sc);
                o2net_sc_queue_work(sc, &sc->sc_rx_work);
                ready = sc->sc_data_ready;
        } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        o2net_set_nst_status_time(&nst);
        wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+        o2net_update_send_stats(&nst, sc);
        /* Note that we avoid overwriting the callers status return
         * variable if a system error was reported on the other
         * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
        if (syserr != O2NET_ERR_NONE)
                goto out_respond;
-        do_gettimeofday(&sc->sc_tv_func_start);
+        o2net_set_func_start_time(sc);
        sc->sc_msg_key = be32_to_cpu(hdr->key);
        sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
        handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
                                             be16_to_cpu(hdr->data_len),
                                        nmh->nh_func_data, &ret_data);
-        do_gettimeofday(&sc->sc_tv_func_stop);
+        o2net_set_func_stop_time(sc);
+        o2net_update_recv_stats(sc);
 out_respond:
        /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
        size_t datalen;
        sclog(sc, "receiving\n");
-        do_gettimeofday(&sc->sc_tv_advance_start);
+        o2net_set_advance_start_time(sc);
        if (unlikely(sc->sc_handshake_ok == 0)) {
                if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
 out:
        sclog(sc, "ret = %d\n", ret);
-        do_gettimeofday(&sc->sc_tv_advance_stop);
+        o2net_set_advance_stop_time(sc);
        return ret;
 }
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
 {
        struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
        struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-        struct timeval now;
-        do_gettimeofday(&now);
+#ifdef CONFIG_DEBUG_FS
+        ktime_t now = ktime_get();
+#endif
        printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
             "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
                     o2net_idle_timeout() / 1000,
                     o2net_idle_timeout() % 1000);
-        mlog(ML_NOTICE, "here are some times that might help debug the "
-             "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
+#ifdef CONFIG_DEBUG_FS
-             "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
+        mlog(ML_NOTICE, "Here are some times that might help debug the "
-             sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
+             "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
-             now.tv_sec, (long) now.tv_usec,
+             "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
-             sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
-             sc->sc_tv_advance_start.tv_sec,
+             (long long)ktime_to_us(sc->sc_tv_data_ready),
-             (long) sc->sc_tv_advance_start.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_advance_start),
-             sc->sc_tv_advance_stop.tv_sec,
+             (long long)ktime_to_us(sc->sc_tv_advance_stop),
-             (long) sc->sc_tv_advance_stop.tv_usec,
             sc->sc_msg_key, sc->sc_msg_type,
-             sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_func_start),
-             sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+             (long long)ktime_to_us(sc->sc_tv_func_stop));
+#endif
        /*
         * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
        o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
        o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
                      msecs_to_jiffies(o2net_keepalive_delay()));
-        do_gettimeofday(&sc->sc_tv_timer);
+        o2net_set_sock_timer(sc);
        mod_timer(&sc->sc_idle_timeout,
               jiffies + msecs_to_jiffies(o2net_idle_timeout()));
 }
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4..4cbcb65784a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
        /* original handlers for the sockets */
        void                    (*sc_state_change)(struct sock *sk);
        void                    (*sc_data_ready)(struct sock *sk, int bytes);
-#ifdef CONFIG_DEBUG_FS
-        struct list_head        sc_net_debug_item;
-#endif
-        struct timeval          sc_tv_timer;
-        struct timeval          sc_tv_data_ready;
-        struct timeval          sc_tv_advance_start;
-        struct timeval          sc_tv_advance_stop;
-        struct timeval          sc_tv_func_start;
-        struct timeval          sc_tv_func_stop;
        u32                     sc_msg_key;
        u16                     sc_msg_type;
+#ifdef CONFIG_DEBUG_FS
+        struct list_head        sc_net_debug_item;
+        ktime_t                 sc_tv_timer;
+        ktime_t                 sc_tv_data_ready;
+        ktime_t                 sc_tv_advance_start;
+        ktime_t                 sc_tv_advance_stop;
+        ktime_t                 sc_tv_func_start;
+        ktime_t                 sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+        ktime_t                 sc_tv_acquiry_total;
+        ktime_t                 sc_tv_send_total;
+        ktime_t                 sc_tv_status_total;
+        u32                     sc_send_count;
+        u32                     sc_recv_count;
+        ktime_t                 sc_tv_process_total;
+#endif
        struct mutex            sc_send_lock;
 };
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
        u32                             st_msg_type;
        u32                             st_msg_key;
        u8                              st_node;
-        struct timeval                  st_sock_time;
+        ktime_t                         st_sock_time;
-        struct timeval                  st_send_time;
+        ktime_t                         st_send_time;
-        struct timeval                  st_status_time;
+        ktime_t                         st_status_time;
 };
 #else
 struct o2net_send_tracking {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 895532ac4d9..6d80ecc7834 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
                                   struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        int ret = 0;    /* if all else fails, just return false */
-        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+        struct ocfs2_super *osb;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        osb = OCFS2_SB(dentry->d_sb);
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -169,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
        struct list_head *p;
        struct dentry *dentry = NULL;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each(p, &inode->i_dentry) {
                dentry = list_entry(p, struct dentry, d_alias);
+                spin_lock(&dentry->d_lock);
                if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
                        mlog(0, "dentry found: %.*s\n",
                             dentry->d_name.len, dentry->d_name.name);
-                        dget_locked(dentry);
+                        dget_dlock(dentry);
+                        spin_unlock(&dentry->d_lock);
                        break;
                }
+                spin_unlock(&dentry->d_lock);
                dentry = NULL;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        return dentry;
 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7a..d417b3f9b0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        di->i_dx_root = cpu_to_le64(dr_blkno);
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        ocfs2_journal_dirty(handle, di_bh);
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
                goto out_commit;
        }
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        di->i_dx_root = cpu_to_le64(0ULL);
        ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f4499915683..3a3ed4bb794 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
+        struct dlm_lock_resource *res;
        BUG_ON(!dlm);
        BUG_ON(!lock);
+        res = lock->lockres;
        assert_spin_locked(&dlm->ast_lock);
        if (!list_empty(&lock->ast_list)) {
-                mlog(ML_ERROR, "ast list not empty!!  pending=%d, newlevel=%d\n",
+                mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+                     "AST list not empty, pending %d, newlevel %d\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                     lock->ast_pending, lock->ml.type);
                BUG();
        }
        if (lock->ast_pending)
-                mlog(0, "lock has an ast getting flushed right now\n");
+                mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        /* putting lock on list, add a ref */
        dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        /* check to see if this ast obsoletes the bast */
        if (dlm_should_cancel_bast(dlm, lock)) {
-                struct dlm_lock_resource *res = lock->lockres;
+                mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
-                mlog(0, "%s: cancelling bast for %.*s\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
-                     dlm->name, res->lockname.len, res->lockname.name);
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
                lock->bast_pending = 0;
                list_del_init(&lock->bast_list);
                lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
        BUG_ON(!dlm);
        BUG_ON(!lock);
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
+        struct dlm_lock_resource *res;
        BUG_ON(!dlm);
        BUG_ON(!lock);
        assert_spin_locked(&dlm->ast_lock);
+        res = lock->lockres;
        BUG_ON(!list_empty(&lock->bast_list));
        if (lock->bast_pending)
-                mlog(0, "lock has a bast getting flushed right now\n");
+                mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        /* putting lock on list, add a ref */
        dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
        BUG_ON(!dlm);
        BUG_ON(!lock);
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        dlm_astlockfunc_t *fn;
        struct dlm_lockstatus *lksb;
-        mlog_entry_void();
+        mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+             res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        lksb = lock->lksb;
        fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        struct dlm_lockstatus *lksb;
        int lksbflags;
-        mlog_entry_void();
+        mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+             res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        lksb = lock->lksb;
        BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 {
        dlm_bastlockfunc_t *fn = lock->bast;
-        mlog_entry_void();
        BUG_ON(lock->ml.node != dlm->node_num);
+        mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+             dlm->name, res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+             blocked_type);
        (*fn)(lock->astdata, blocked_type);
 }
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        /* cannot get a proxy ast message if this node owns it */
        BUG_ON(res->owner == dlm->node_num);
-        mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+        mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+             res->lockname.name);
        spin_lock(&res->spinlock);
        if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
        if (past->type == DLM_AST) {
                /* do not alter lock refcount.  switching lists. */
                list_move_tail(&lock->list, &res->granted);
-                mlog(0, "ast: Adding to granted list... type=%d, "
+                mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
-                     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+                     lock->ml.type, lock->ml.convert_type);
                if (lock->ml.convert_type != LKM_IVMODE) {
                        lock->ml.type = lock->ml.convert_type;
                        lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        size_t veclen = 1;
        int status;
-        mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
+        mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
-                   res->lockname.len, res->lockname.name, lock->ml.node,
+             res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
-                   msg_type, blocked_type);
+             blocked_type);
        memset(&past, 0, sizeof(struct dlm_proxy_ast));
        past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        vec[0].iov_len = sizeof(struct dlm_proxy_ast);
        vec[0].iov_base = &past;
        if (flags & DLM_LKSB_GET_LVB) {
-                mlog(0, "returning requested LVB data\n");
                be32_add_cpu(&past.flags, LKM_GET_LVB);
                vec[1].iov_len = DLM_LVB_LEN;
                vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
                                     lock->ml.node, &status);
        if (ret < 0)
-                mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
-                     "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+                     dlm->name, res->lockname.len, res->lockname.name, ret,
                     lock->ml.node);
        else {
                if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5..4bdf7baee34 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 enum dlm_mle_type {
-        DLM_MLE_BLOCK,
+        DLM_MLE_BLOCK = 0,
-        DLM_MLE_MASTER,
+        DLM_MLE_MASTER = 1,
-        DLM_MLE_MIGRATION,
+        DLM_MLE_MIGRATION = 2,
-        DLM_MLE_NUM_TYPES
+        DLM_MLE_NUM_TYPES = 3,
 };
 struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
 enum dlm_ast_type {
        DLM_AST = 0,
-        DLM_BAST,
+        DLM_BAST = 1,
-        DLM_ASTUNLOCK
+        DLM_ASTUNLOCK = 2,
 };
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
 enum dlm_ctxt_state {
        DLM_CTXT_NEW = 0,
-        DLM_CTXT_JOINED,
+        DLM_CTXT_JOINED = 1,
-        DLM_CTXT_IN_SHUTDOWN,
+        DLM_CTXT_IN_SHUTDOWN = 2,
-        DLM_CTXT_LEAVING,
+        DLM_CTXT_LEAVING = 3,
 };
 struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
 enum dlm_lockres_list {
        DLM_GRANTED_LIST = 0,
-        DLM_CONVERTING_LIST,
+        DLM_CONVERTING_LIST = 1,
-        DLM_BLOCKED_LIST
+        DLM_BLOCKED_LIST = 2,
 };
 static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
 enum {
-        DLM_MASTER_REQUEST_MSG    = 500,
+        DLM_MASTER_REQUEST_MSG          = 500,
-        DLM_UNUSED_MSG1,         /* 501 */
+        DLM_UNUSED_MSG1                 = 501,
-        DLM_ASSERT_MASTER_MSG,   /* 502 */
+        DLM_ASSERT_MASTER_MSG           = 502,
-        DLM_CREATE_LOCK_MSG,     /* 503 */
+        DLM_CREATE_LOCK_MSG             = 503,
-        DLM_CONVERT_LOCK_MSG,    /* 504 */
+        DLM_CONVERT_LOCK_MSG            = 504,
-        DLM_PROXY_AST_MSG,       /* 505 */
+        DLM_PROXY_AST_MSG               = 505,
-        DLM_UNLOCK_LOCK_MSG,     /* 506 */
+        DLM_UNLOCK_LOCK_MSG             = 506,
-        DLM_DEREF_LOCKRES_MSG,   /* 507 */
+        DLM_DEREF_LOCKRES_MSG           = 507,
-        DLM_MIGRATE_REQUEST_MSG, /* 508 */
+        DLM_MIGRATE_REQUEST_MSG         = 508,
-        DLM_MIG_LOCKRES_MSG,     /* 509 */
+        DLM_MIG_LOCKRES_MSG             = 509,
-        DLM_QUERY_JOIN_MSG,      /* 510 */
+        DLM_QUERY_JOIN_MSG              = 510,
-        DLM_ASSERT_JOINED_MSG,   /* 511 */
+        DLM_ASSERT_JOINED_MSG           = 511,
-        DLM_CANCEL_JOIN_MSG,     /* 512 */
+        DLM_CANCEL_JOIN_MSG             = 512,
-        DLM_EXIT_DOMAIN_MSG,     /* 513 */
+        DLM_EXIT_DOMAIN_MSG             = 513,
-        DLM_MASTER_REQUERY_MSG,  /* 514 */
+        DLM_MASTER_REQUERY_MSG          = 514,
-        DLM_LOCK_REQUEST_MSG,    /* 515 */
+        DLM_LOCK_REQUEST_MSG            = 515,
-        DLM_RECO_DATA_DONE_MSG,  /* 516 */
+        DLM_RECO_DATA_DONE_MSG          = 516,
-        DLM_BEGIN_RECO_MSG,      /* 517 */
+        DLM_BEGIN_RECO_MSG              = 517,
-        DLM_FINALIZE_RECO_MSG,   /* 518 */
+        DLM_FINALIZE_RECO_MSG           = 518,
-        DLM_QUERY_REGION,        /* 519 */
+        DLM_QUERY_REGION                = 519,
-        DLM_QUERY_NODEINFO,      /* 520 */
+        DLM_QUERY_NODEINFO              = 520,
 };
 struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
 enum {
        DLM_RECO_NODE_DATA_DEAD = -1,
        DLM_RECO_NODE_DATA_INIT = 0,
-        DLM_RECO_NODE_DATA_REQUESTING,
+        DLM_RECO_NODE_DATA_REQUESTING = 1,
-        DLM_RECO_NODE_DATA_REQUESTED,
+        DLM_RECO_NODE_DATA_REQUESTED = 2,
-        DLM_RECO_NODE_DATA_RECEIVING,
+        DLM_RECO_NODE_DATA_RECEIVING = 3,
-        DLM_RECO_NODE_DATA_DONE,
+        DLM_RECO_NODE_DATA_DONE = 4,
-        DLM_RECO_NODE_DATA_FINALIZE_SENT,
+        DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
 };
 enum {
        DLM_MASTER_RESP_NO = 0,
-        DLM_MASTER_RESP_YES,
+        DLM_MASTER_RESP_YES = 1,
-        DLM_MASTER_RESP_MAYBE,
+        DLM_MASTER_RESP_MAYBE = 2,
-        DLM_MASTER_RESP_ERROR
+        DLM_MASTER_RESP_ERROR = 3,
 };
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
 #define DLM_MOD_KEY (0x666c6172)
 enum dlm_query_join_response_code {
        JOIN_DISALLOW = 0,
-        JOIN_OK,
+        JOIN_OK = 1,
-        JOIN_OK_NO_MAP,
+        JOIN_OK_NO_MAP = 2,
-        JOIN_PROTOCOL_MISMATCH,
+        JOIN_PROTOCOL_MISMATCH = 3,
 };
 struct dlm_query_join_packet {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a5..04a32be0aeb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
        kref_get(&dc->debug_refcnt);
 }
-static struct debug_buffer *debug_buffer_allocate(void)
+static int debug_release(struct inode *inode, struct file *file)
 {
-        struct debug_buffer *db = NULL;
+        free_page((unsigned long)file->private_data);
+        return 0;
-        db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
-        if (!db)
-                goto bail;
-        db->len = PAGE_SIZE;
-        db->buf = kmalloc(db->len, GFP_KERNEL);
-        if (!db->buf)
-                goto bail;
-        return db;
-bail:
-        kfree(db);
-        return NULL;
-}
-static ssize_t debug_buffer_read(struct file *file, char __user *buf,
-                                 size_t nbytes, loff_t *ppos)
-{
-        struct debug_buffer *db = file->private_data;
-        return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
-}
-static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
-{
-        struct debug_buffer *db = file->private_data;
-        loff_t new = -1;
-        switch (whence) {
-        case 0:
-                new = off;
-                break;
-        case 1:
-                new = file->f_pos + off;
-                break;
-        }
-        if (new < 0 || new > db->len)
-                return -EINVAL;
-        return (file->f_pos = new);
 }
-static int debug_buffer_release(struct inode *inode, struct file *file)
+static ssize_t debug_read(struct file *file, char __user *buf,
+                          size_t nbytes, loff_t *ppos)
 {
-        struct debug_buffer *db = file->private_data;
+        return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                       i_size_read(file->f_mapping->host));
-        if (db)
-                kfree(db->buf);
-        kfree(db);
-        return 0;
 }
 /* end - util funcs */
 /* begin - purge list funcs */
-static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        struct dlm_lock_resource *res;
        int out = 0;
        unsigned long total = 0;
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dumping Purgelist for Domain: %s\n", dlm->name);
        spin_lock(&dlm->spinlock);
        list_for_each_entry(res, &dlm->purge_list, purge) {
                ++total;
-                if (db->len - out < 100)
+                if (len - out < 100)
                        continue;
                spin_lock(&res->spinlock);
                out += stringify_lockname(res->lockname.name,
                                          res->lockname.len,
-                                          db->buf + out, db->len - out);
+                                          buf + out, len - out);
-                out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+                out += snprintf(buf + out, len - out, "\t%ld\n",
                                (jiffies - res->last_used)/HZ);
                spin_unlock(&res->spinlock);
        }
        spin_unlock(&dlm->spinlock);
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
-                        "Total on list: %ld\n", total);
        return out;
 }
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_purgelist_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_purgelist_print(dlm, db);
+        i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -480,14 +434,14 @@ bail:
 static const struct file_operations debug_purgelist_fops = {
        .open =         debug_purgelist_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end - purge list funcs */
 /* begin - debug mle funcs */
-static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        struct dlm_master_list_entry *mle;
        struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        int i, out = 0;
        unsigned long total = 0, longest = 0, bucket_count = 0;
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dumping MLEs for Domain: %s\n", dlm->name);
        spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                                          master_hash_node);
                        ++total;
                        ++bucket_count;
-                        if (db->len - out < 200)
+                        if (len - out < 200)
                                continue;
-                        out += dump_mle(mle, db->buf + out, db->len - out);
+                        out += dump_mle(mle, buf + out, len - out);
                }
                longest = max(longest, bucket_count);
                bucket_count = 0;
        }
        spin_unlock(&dlm->master_lock);
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Total: %ld, Longest: %ld\n", total, longest);
        return out;
 }
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_mle_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_mle_print(dlm, db);
+        i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -540,9 +494,9 @@ bail:
 static const struct file_operations debug_mle_fops = {
        .open =         debug_mle_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
 /* end - debug lockres funcs */
 /* begin - debug state funcs */
-static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        int out = 0;
        struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        }
        /* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Domain: %s  Key: 0x%08x  Protocol: %d.%d\n",
                        dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
                        dlm->dlm_locking_proto.pv_minor);
        /* Thread Pid: xxx  Node: xxx  State: xxxxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Thread Pid: %d  Node: %d  State: %s\n",
-                        dlm->dlm_thread_task->pid, dlm->node_num, state);
+                        task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
        /* Number of Joins: xxx  Joining Node: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Number of Joins: %d  Joining Node: %d\n",
                        dlm->num_joins, dlm->joining_node);
        /* Domain Map: xx xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+        out += snprintf(buf + out, len - out, "Domain Map: ");
        out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Live Map: xx xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+        out += snprintf(buf + out, len - out, "Live Map: ");
        out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Lock Resources: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Lock Resources: %d (%d)\n",
                        atomic_read(&dlm->res_cur_count),
                        atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                cur_mles += atomic_read(&dlm->mle_cur_count[i]);
        /* MLEs: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "MLEs: %d (%d)\n", cur_mles, tot_mles);
        /*  Blocking: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Blocking: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
        /*  Mastery: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Mastery: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
        /*  Migration: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Migration: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
        /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
                        "PendingBASTs=%s\n",
                        (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                        (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
        /* Purge Count: xxx  Refs: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Purge Count: %d  Refs: %d\n", dlm->purge_count,
                        atomic_read(&dlm->dlm_refs.refcount));
        /* Dead Node: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dead Node: %d\n", dlm->reco.dead_node);
        /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                state = "INACTIVE";
        /* Recovery Pid: xxxx  Master: xxx  State: xxxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Recovery Pid: %d  Master: %d  State: %s\n",
-                        dlm->dlm_reco_thread_task->pid,
+                        task_pid_nr(dlm->dlm_reco_thread_task),
                        dlm->reco.new_master, state);
        /* Recovery Map: xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+        out += snprintf(buf + out, len - out, "Recovery Map: ");
        out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Recovery Node State: */
-        out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+        out += snprintf(buf + out, len - out, "Recovery Node State:\n");
        list_for_each_entry(node, &dlm->reco.node_data, list) {
                switch (node->state) {
                case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                        state = "BAD";
                        break;
                }
-                out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+                out += snprintf(buf + out, len - out, "\t%u - %s\n",
                                node->node_num, state);
        }
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_state_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db = NULL;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_state_print(dlm, db);
+        i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -936,9 +890,9 @@ bail:
 static const struct file_operations debug_state_fops = {
        .open =         debug_state_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end  - debug state funcs */
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
        struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
        if (dc) {
-                if (dc->debug_purgelist_dentry)
+                debugfs_remove(dc->debug_purgelist_dentry);
-                        debugfs_remove(dc->debug_purgelist_dentry);
+                debugfs_remove(dc->debug_mle_dentry);
-                if (dc->debug_mle_dentry)
+                debugfs_remove(dc->debug_lockres_dentry);
-                        debugfs_remove(dc->debug_mle_dentry);
+                debugfs_remove(dc->debug_state_dentry);
-                if (dc->debug_lockres_dentry)
-                        debugfs_remove(dc->debug_lockres_dentry);
-                if (dc->debug_state_dentry)
-                        debugfs_remove(dc->debug_state_dentry);
                dlm_debug_put(dc);
        }
 }
@@ -1040,8 +990,7 @@ bail:
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-        if (dlm->dlm_debugfs_subroot)
+        debugfs_remove(dlm->dlm_debugfs_subroot);
-                debugfs_remove(dlm->dlm_debugfs_subroot);
 }
 /* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
 void dlm_destroy_debugfs_root(void)
 {
-        if (dlm_debugfs_root)
+        debugfs_remove(dlm_debugfs_root);
-                debugfs_remove(dlm_debugfs_root);
 }
 #endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c..1f27c4812d1 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
        struct dentry *debug_purgelist_dentry;
 };
-struct debug_buffer {
-        int len;
-        char *buf;
-};
 struct debug_lockres {
        int dl_len;
        char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index cc2aaa96cfe..7e38a072d72 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
                }
                cond_resched_lock(&dlm->spinlock);
                num += n;
-                mlog(0, "%s: touched %d lockreses in bucket %d "
-                     "(tot=%d)\n", dlm->name, n, i, num);
        }
        spin_unlock(&dlm->spinlock);
        wake_up(&dlm->dlm_thread_wq);
@@ -1661,8 +1659,8 @@ bail:
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
 {
-        o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+        o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
-        o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
+        o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
        o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
 }
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
                            dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
-        status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
+        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
        if (status)
                goto bail;
        o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
                            dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
-        status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
+        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
        if (status)
                goto bail;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c..7009292aac5 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
                if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
                        return 0;
+                if (!dlm_lock_compatible(tmplock->ml.convert_type,
+                                         lock->ml.type))
+                        return 0;
        }
        return 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80..59f0f6bdfc6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res,
-                                      int *numlocks)
+                                      int *numlocks,
+                                      int *hasrefs)
 {
        int ret;
        int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        assert_spin_locked(&res->spinlock);
+        *numlocks = 0;
+        *hasrefs = 0;
        ret = -EINVAL;
        if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        }
        *numlocks = count;
-        mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+        count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (count < O2NM_MAX_NODES)
+                *hasrefs = 1;
+        mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
+             res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
 leave:
        return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        int wake = 0;
        if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        name = res->lockname.name;
        namelen = res->lockname.len;
-        mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+        mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
        /*
         * ensure this lockres is a proper candidate for migration
         */
        spin_lock(&res->spinlock);
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
        if (ret < 0) {
                spin_unlock(&res->spinlock);
                goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        spin_unlock(&res->spinlock);
        /* no work to do */
-        if (numlocks == 0) {
+        if (numlocks == 0 && !hasrefs)
-                mlog(0, "no locks were found on this lockres! done!\n");
                goto leave;
-        }
        /*
         * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
         * find a node to migrate the lockres to
         */
-        mlog(0, "picking a migration node\n");
        spin_lock(&dlm->spinlock);
        /* pick a new node */
        if (!test_bit(target, dlm->domain_map) ||
            target >= O2NM_MAX_NODES) {
                target = dlm_pick_migration_target(dlm, res);
        }
-        mlog(0, "node %u chosen for migration\n", target);
+        mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
+             namelen, name, target);
        if (target >= O2NM_MAX_NODES ||
            !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        spin_lock(&res->spinlock);
        if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        }
        /* No need to migrate a lockres having no locks */
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-        if (ret >= 0 && numlocks == 0) {
+        if (ret >= 0 && numlocks == 0 && !hasrefs) {
                spin_unlock(&res->spinlock);
                goto leave;
        }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                }
                queue++;
        }
+        nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (nodenum < O2NM_MAX_NODES) {
+                spin_unlock(&res->spinlock);
+                return nodenum;
+        }
        spin_unlock(&res->spinlock);
        mlog(0, "have not found a suitable target yet! checking domain map\n");
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9..1d6d1d22c47 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
 void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                              struct dlm_lock_resource *res)
 {
-        mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&res->spinlock);
        if (__dlm_lockres_unused(res)){
                if (list_empty(&res->purge)) {
-                        mlog(0, "putting lockres %.*s:%p onto purge list\n",
+                        mlog(0, "%s: Adding res %.*s to purge list\n",
-                             res->lockname.len, res->lockname.name, res);
+                             dlm->name, res->lockname.len, res->lockname.name);
                        res->last_used = jiffies;
                        dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                        dlm->purge_count++;
                }
        } else if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
+                mlog(0, "%s: Removing res %.*s from purge list\n",
-                     res->lockname.len, res->lockname.name, res, res->owner);
+                     dlm->name, res->lockname.len, res->lockname.name);
                list_del_init(&res->purge);
                dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                            struct dlm_lock_resource *res)
 {
-        mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
        spin_lock(&dlm->spinlock);
        spin_lock(&res->spinlock);
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
        master = (res->owner == dlm->node_num);
+        mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
-        mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
+             res->lockname.len, res->lockname.name, master);
-             res->lockname.name, master);
        if (!master) {
                res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
                /* clear our bit from the master's refmap, ignore errors */
                ret = dlm_drop_lockres_ref(dlm, res);
                if (ret < 0) {
-                        mlog_errno(ret);
+                        mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
+                             res->lockname.len, res->lockname.name, ret);
                        if (!dlm_is_host_down(ret))
                                BUG();
                }
-                mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
-                     dlm->name, res->lockname.len, res->lockname.name, ret);
                spin_lock(&dlm->spinlock);
                spin_lock(&res->spinlock);
        }
        if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s:%p from purgelist, "
+                mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
-                     "master = %d\n", res->lockname.len, res->lockname.name,
+                     dlm->name, res->lockname.len, res->lockname.name, master);
-                     res, master);
                list_del_init(&res->purge);
                dlm_lockres_put(res);
                dlm->purge_count--;
        }
        if (!__dlm_lockres_unused(res)) {
-                mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+                mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
                     dlm->name, res->lockname.len, res->lockname.name);
                __dlm_print_one_lock_resource(res);
                BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                unused = __dlm_lockres_unused(lockres);
                if (!unused ||
                    (lockres->state & DLM_LOCK_RES_MIGRATING)) {
-                        mlog(0, "lockres %s:%.*s: is in use or "
+                        mlog(0, "%s: res %.*s is in use or being remastered, "
-                             "being remastered, used %d, state %d\n",
+                             "used %d, state %d\n", dlm->name,
-                             dlm->name, lockres->lockname.len,
+                             lockres->lockname.len, lockres->lockname.name,
-                             lockres->lockname.name, !unused, lockres->state);
+                             !unused, lockres->state);
                        list_move_tail(&dlm->purge_list, &lockres->purge);
                        spin_unlock(&lockres->spinlock);
                        continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
        struct list_head *head;
        int can_grant = 1;
-        //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
+        /*
-        //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
+         * Because this function is called with the lockres
-        //mlog(0, "shuffle res %.*s\n", res->lockname.len,
-        //        res->lockname.name);
-        /* because this function is called with the lockres
         * spinlock, and because we know that it is not migrating/
         * recovering/in-progress, it is fine to reserve asts and
-         * basts right before queueing them all throughout */
+         * basts right before queueing them all throughout
+         */
        assert_spin_locked(&dlm->ast_lock);
        assert_spin_locked(&res->spinlock);
        BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
 converting:
        if (list_empty(&res->converting))
                goto blocked;
-        mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
+        mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
-             res->lockname.name);
+             res->lockname.len, res->lockname.name);
        target = list_entry(res->converting.next, struct dlm_lock, list);
        if (target->ml.convert_type == LKM_IVMODE) {
-                mlog(ML_ERROR, "%.*s: converting a lock with no "
+                mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
-                     "convert_type!\n", res->lockname.len, res->lockname.name);
+                     dlm->name, res->lockname.len, res->lockname.name);
                BUG();
        }
        head = &res->granted;
@@ -365,9 +356,12 @@ converting:
                spin_lock(&target->spinlock);
                BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
-                mlog(0, "calling ast for converting lock: %.*s, have: %d, "
+                mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
-                     "granting: %d, node: %u\n", res->lockname.len,
+                     "%d => %d, node %u\n", dlm->name, res->lockname.len,
-                     res->lockname.name, target->ml.type,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+                     target->ml.type,
                     target->ml.convert_type, target->ml.node);
                target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
                spin_lock(&target->spinlock);
                BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
-                mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
+                mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
-                     "node: %u\n", res->lockname.len, res->lockname.name,
+                     "node %u\n", dlm->name, res->lockname.len,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
                     target->ml.type, target->ml.node);
-                // target->ml.type is already correct
+                /* target->ml.type is already correct */
                list_move_tail(&target->list, &res->granted);
                BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
 /* must have NO locks when calling this with res !=NULL * */
 void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
-        mlog_entry("dlm=%p, res=%p\n", dlm, res);
        if (res) {
                spin_lock(&dlm->spinlock);
                spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
-        mlog_entry("dlm=%p, res=%p\n", dlm, res);
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&res->spinlock);
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
                        res->state |= DLM_LOCK_RES_DIRTY;
                }
        }
+        mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+             res->lockname.name);
 }
 /* Launch the NM thread for the mounted volume */
 int dlm_launch_thread(struct dlm_ctxt *dlm)
 {
-        mlog(0, "starting dlm thread...\n");
+        mlog(0, "Starting dlm_thread...\n");
        dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
        if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
 void dlm_complete_thread(struct dlm_ctxt *dlm)
 {
        if (dlm->dlm_thread_task) {
-                mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+                mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
                kthread_stop(dlm->dlm_thread_task);
                dlm->dlm_thread_task = NULL;
        }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* get an extra ref on lock */
                dlm_lock_get(lock);
                res = lock->lockres;
-                mlog(0, "delivering an ast for this lockres\n");
+                mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+                     "node %u\n", dlm->name, res->lockname.len,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                     lock->ml.type, lock->ml.node);
                BUG_ON(!lock->ast_pending);
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* possible that another ast was queued while
                 * we were delivering the last one */
                if (!list_empty(&lock->ast_list)) {
-                        mlog(0, "aha another ast got queued while "
+                        mlog(0, "%s: res %.*s, AST queued while flushing last "
-                             "we were finishing the last one.  will "
+                             "one\n", dlm->name, res->lockname.len,
-                             "keep the ast_pending flag set.\n");
+                             res->lockname.name);
                } else
                        lock->ast_pending = 0;
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                dlm_lock_put(lock);
                spin_unlock(&dlm->ast_lock);
-                mlog(0, "delivering a bast for this lockres "
+                mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
-                     "(blocked = %d\n", hi);
+                     "blocked %d, node %u\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                     hi, lock->ml.node);
                if (lock->ml.node != dlm->node_num) {
                        ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* possible that another bast was queued while
                 * we were delivering the last one */
                if (!list_empty(&lock->bast_list)) {
-                        mlog(0, "aha another bast got queued while "
+                        mlog(0, "%s: res %.*s, BAST queued while flushing last "
-                             "we were finishing the last one.  will "
+                             "one\n", dlm->name, res->lockname.len,
-                             "keep the bast_pending flag set.\n");
+                             res->lockname.name);
                } else
                        lock->bast_pending = 0;
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
                        spin_lock(&res->spinlock);
                        if (res->owner != dlm->node_num) {
                                __dlm_print_one_lock_resource(res);
-                                mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
+                                mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
-                                     res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
+                                     " dirty %d\n", dlm->name,
-                                     res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
+                                     !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
-                                     res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
+                                     !!(res->state & DLM_LOCK_RES_MIGRATING),
-                                     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+                                     !!(res->state & DLM_LOCK_RES_RECOVERING),
+                                     !!(res->state & DLM_LOCK_RES_DIRTY));
                        }
                        BUG_ON(res->owner != dlm->node_num);
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
                                res->state &= ~DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);
                                spin_unlock(&dlm->ast_lock);
-                                mlog(0, "delaying list shuffling for in-"
+                                mlog(0, "%s: res %.*s, inprogress, delay list "
-                                     "progress lockres %.*s, state=%d\n",
+                                     "shuffle, state %d\n", dlm->name,
                                     res->lockname.len, res->lockname.name,
                                     res->state);
                                delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
                         * spinlock and do NOT have the dlm lock.
                         * safe to reserve/queue asts and run the lists. */
-                        mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
-                             "res=%.*s\n", dlm->name,
-                             res->lockname.len, res->lockname.name);
                        /* called while holding lockres lock */
                        dlm_shuffle_lists(dlm, res);
                        res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
                        /* unlikely, but we may need to give time to
                         * other tasks */
                        if (!--n) {
-                                mlog(0, "throttling dlm_thread\n");
+                                mlog(0, "%s: Throttling dlm thread\n",
+                                     dlm->name);
                                break;
                        }
                }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2df490a19e..8c5c0eddc36 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
        return &ip->ip_vfs_inode;
 }
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
 static void dlmfs_evict_inode(struct inode *inode)
 {
        int status;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af..6adafa57606 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -138,7 +138,7 @@ check_gen:
        result = d_obtain_alias(inode);
        if (!IS_ERR(result))
-                result->d_op = &ocfs2_dentry_ops;
+                d_set_d_op(result, &ocfs2_dentry_ops);
        else
                mlog_errno(PTR_ERR(result));
@@ -176,7 +176,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
        if (!IS_ERR(parent))
-                parent->d_op = &ocfs2_dentry_ops;
+                d_set_d_op(parent, &ocfs2_dentry_ops);
 bail_unlock:
        ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a280..bdadbae0909 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1307,10 +1307,13 @@ bail:
        return err;
 }
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int ret;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        mlog_entry_void();
        ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
                goto out;
        }
-        ret = generic_permission(inode, mask, ocfs2_check_acl);
+        ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
        ocfs2_inode_unlock(inode, 0);
 out:
@@ -2241,11 +2244,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        mutex_lock(&inode->i_mutex);
+        ocfs2_iocb_clear_sem_locked(iocb);
 relock:
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
        if (direct_io) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                /* communicate with ocfs2_dio_end_io */
+                ocfs2_iocb_set_sem_locked(iocb);
        }
        /*
@@ -2382,8 +2389,10 @@ out:
                ocfs2_rw_unlock(inode, rw_level);
 out_sems:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        mutex_unlock(&inode->i_mutex);
@@ -2527,6 +2536,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                goto bail;
        }
+        ocfs2_iocb_clear_sem_locked(iocb);
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2545,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        if (filp->f_flags & O_DIRECT) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                ocfs2_iocb_set_sem_locked(iocb);
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
@@ -2575,8 +2587,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        }
 bail:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
        mlog_exit(ret);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7..f5afbbef670 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
 int ocfs2_should_update_atime(struct inode *inode,
                              struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ff5744e1e36..30c52314445 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        spin_unlock(&oi->ip_lock);
 bail_add:
-        dentry->d_op = &ocfs2_dentry_ops;
+        d_set_d_op(dentry, &ocfs2_dentry_ops);
        ret = d_splice_alias(inode, dentry);
        if (inode) {
@@ -415,7 +415,7 @@ static int ocfs2_mknod(struct inode *dir,
                mlog_errno(status);
                goto leave;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
+        d_set_d_op(dentry, &ocfs2_dentry_ops);
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +743,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        }
        ihold(inode);
-        dentry->d_op = &ocfs2_dentry_ops;
+        d_set_d_op(dentry, &ocfs2_dentry_ops);
        d_instantiate(dentry, inode);
 out_commit:
@@ -1017,8 +1017,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                 * An error return must mean that no cluster locks
                 * were held on function exit.
                 */
-                if (oi1->ip_blkno != oi2->ip_blkno)
+                if (oi1->ip_blkno != oi2->ip_blkno) {
                        ocfs2_inode_unlock(inode2, 1);
+                        brelse(*bh2);
+                        *bh2 = NULL;
+                }
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1794,7 +1797,7 @@ static int ocfs2_symlink(struct inode *dir,
                mlog_errno(status);
                goto bail;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
+        d_set_d_op(dentry, &ocfs2_dentry_ops);
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2462,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                goto out_commit;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
+        d_set_d_op(dentry, &ocfs2_dentry_ops);
        d_instantiate(dentry, inode);
        status = 0;
 out_commit:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 70dd3b1798f..51cd6898e7f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -420,6 +420,11 @@ struct ocfs2_super
        struct inode                    *osb_tl_inode;
        struct buffer_head              *osb_tl_bh;
        struct delayed_work             osb_truncate_log_wq;
+        /*
+         * Number of clusters in our truncate log.
+         * It must be protected by osb_tl_inode->i_mutex.
+         */
+        unsigned int truncated_clusters;
        struct ocfs2_node_map           osb_recovering_orphan_dirs;
        unsigned int                    *osb_orphan_wipes;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2..bf2e7764920 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
 #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
        NUM_SYSTEM_INODES
 };
-#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
 #define NUM_LOCAL_SYSTEM_INODES \
                (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index cfeab7ce369..17ff46fa8a1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -569,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
 }
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
                                                unsigned int cbits)
 {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 911e61f348f..a2a5bff774e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
-static void openprom_destroy_inode(struct inode *inode)
+static void openprom_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(op_inode_cachep, OP_I(inode));
 }
+static void openprom_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, openprom_i_callback);
+}
 static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 {
        struct inode *inode;
diff --git a/fs/pipe.c b/fs/pipe.c
index 04629f36e39..04151e2aee9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
                        break;
                }
                if (do_wakeup) {
-                        wake_up_interruptible_sync(&pipe->wait);
+                        wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }
                pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
        /* Signal writers asynchronously that there is more room. */
        if (do_wakeup) {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
                        break;
                }
                if (do_wakeup) {
-                        wake_up_interruptible_sync(&pipe->wait);
+                        wake_up_interruptible_sync_poll(&pipe->wait, POLLIN);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                        do_wakeup = 0;
                }
@@ -623,7 +623,7 @@ redo2:
 out:
        mutex_unlock(&inode->i_mutex);
        if (do_wakeup) {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
        if (!pipe->readers && !pipe->writers) {
                free_pipe_info(inode);
        } else {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
@@ -999,12 +999,12 @@ struct file *create_write_pipe(int flags)
                goto err;
        err = -ENOMEM;
-        path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
+        path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
        if (!path.dentry)
                goto err_inode;
        path.mnt = mntget(pipe_mnt);
-        path.dentry->d_op = &pipefs_dentry_operations;
+        d_set_d_op(path.dentry, &pipefs_dentry_operations);
        d_instantiate(path.dentry, inode);
        err = -ENFILE;
@@ -1253,6 +1253,10 @@ out:
        return ret;
 }
+/* Super operations for pipefs: inodes are released via free_inode_nonrcu. */
+static const struct super_operations pipefs_ops = {
+        .destroy_inode  = free_inode_nonrcu,
+};
 /*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
@@ -1262,7 +1266,7 @@ out:
 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
                         int flags, const char *dev_name, void *data)
 {
-        return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+        return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC);
 }
 static struct file_system_type pipe_fs_type = {
@@ -1288,7 +1292,7 @@ static int __init init_pipe_fs(void)
 static void __exit exit_pipe_fs(void)
 {
        unregister_filesystem(&pipe_fs_type);
-        mntput(pipe_mnt);
+        mntput_long(pipe_mnt);
 }
 fs_initcall(init_pipe_fs);
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748..d42514e3238 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
 */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-        int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
+        int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
        return (mycount > count);
 }
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
 * Check if any of these mounts that **do not have submounts**
 * have more references than 'refcnt'. If so return busy.
 *
- * vfsmount lock must be held for read or write
+ * vfsmount lock must be held for write
 */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc51..df434c5f28f 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
                proc_tty.o
 proc-y  += cmdline.o
+proc-y  += consoles.o
 proc-y  += cpuinfo.o
 proc-y  += devices.o
 proc-y  += interrupts.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676a..df2b703b9d0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
        get_task_comm(tcomm, p);
-        seq_printf(m, "Name:\t");
+        seq_puts(m, "Name:\t");
        end = m->buf + m->size;
        buf = m->buf + m->count;
        name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
                buf++;
        }
        m->count = buf - m->buf;
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 /*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                seq_printf(m, "%d ", GROUP_AT(group_info, g));
        put_cred(cred);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
 {
        int i;
-        seq_printf(m, "%s", header);
+        seq_puts(m, header);
        i = _NSIG;
        do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
                seq_printf(m, "%x", x);
        } while (i >= 4);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
 {
        unsigned __capi;
-        seq_printf(m, "%s", header);
+        seq_puts(m, header);
        CAP_FOR_EACH_U32(__capi) {
                seq_printf(m, "%08x",
                           a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
        }
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
-        seq_printf(m, "Cpus_allowed:\t");
+        seq_puts(m, "Cpus_allowed:\t");
        seq_cpumask(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
-        seq_printf(m, "Cpus_allowed_list:\t");
+        seq_puts(m, "Cpus_allowed_list:\t");
        seq_cpumask_list(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
 {
-        int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
+        unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
        struct mm_struct *mm = get_task_mm(task);
        if (mm) {
                size = task_statm(mm, &shared, &text, &data, &resident);
                mmput(mm);
        }
-        seq_printf(m, "%d %d %d %d %d %d %d\n",
+        seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
-                        size, resident, shared, text, lib, data, 0);
+                        size, resident, shared, text, data);
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe..93f1cdd5d3d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
                return -ESRCH;
        seq_puts(m, "Latency Top version : v0.1\n");
        for (i = 0; i < 32; i++) {
-                if (task->latency_record[i].backtrace[0]) {
+                struct latency_record *lr = &task->latency_record[i];
+                if (lr->backtrace[0]) {
                        int q;
-                        seq_printf(m, "%i %li %li ",
+                        seq_printf(m, "%i %li %li",
-                                task->latency_record[i].count,
+                                   lr->count, lr->time, lr->max);
-                                task->latency_record[i].time,
-                                task->latency_record[i].max);
                        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-                                char sym[KSYM_SYMBOL_LEN];
+                                unsigned long bt = lr->backtrace[q];
-                                char *c;
+                                if (!bt)
-                                if (!task->latency_record[i].backtrace[q])
                                        break;
-                                if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+                                if (bt == ULONG_MAX)
                                        break;
-                                sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+                                seq_printf(m, " %ps", (void *)bt);
-                                c = strchr(sym, '+');
-                                if (c)
-                                        *c = 0;
-                                seq_printf(m, "%s ", sym);
                        }
-                        seq_printf(m, "\n");
+                        seq_putc(m, '\n');
                }
        }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
 static int proc_single_open(struct inode *inode, struct file *filp)
 {
-        int ret;
+        return single_open(filp, proc_single_show, inode);
-        ret = single_open(filp, proc_single_show, NULL);
-        if (!ret) {
-                struct seq_file *m = filp->private_data;
-                m->private = inode;
-        }
-        return ret;
 }
 static const struct file_operations proc_single_file_operations = {
@@ -1386,9 +1373,77 @@ sched_write(struct file *file, const char __user *buf,
 static int sched_open(struct inode *inode, struct file *filp)
 {
+        return single_open(filp, sched_show, inode);
+}
+/*
+ * File operations for the per-task "sched" file: opened through
+ * sched_open(), read and seeked through the seq_file helpers, written
+ * through sched_write(), and torn down with single_release().
+ */
+static const struct file_operations proc_pid_sched_operations = {
+        .open           = sched_open,
+        .release        = single_release,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .write          = sched_write,
+};
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * seq_file show handler for the per-task autogroup file.  The inode stored
+ * in m->private identifies the task; returns -ESRCH when that task can no
+ * longer be found, 0 once its autogroup information has been printed.
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+        struct task_struct *task = get_proc_task(m->private);
+        if (!task)
+                return -ESRCH;
+        proc_sched_autogroup_show_task(task, m);
+        /* Drop the task reference once printing is done. */
+        put_task_struct(task);
+        return 0;
+}
+/*
+ * Write handler for the per-task autogroup file.  Parses a nice value from
+ * the user buffer (at most PROC_NUMBUF - 1 bytes are consumed) and passes
+ * it to proc_sched_autogroup_set_nice() for the task behind the inode.
+ *
+ * Returns the number of bytes consumed on success, -EFAULT if the user
+ * buffer cannot be copied, -EINVAL if the text is not a number that fits
+ * in an int, -ESRCH if the task is gone, or whatever error
+ * proc_sched_autogroup_set_nice() reports.
+ */
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+            size_t count, loff_t *offset)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct task_struct *p;
+        char buffer[PROC_NUMBUF];
+        long nice;
+        int val;
+        int err;
+        /* Zero-fill so the copied text is always NUL-terminated. */
+        memset(buffer, 0, sizeof(buffer));
+        if (count > sizeof(buffer) - 1)
+                count = sizeof(buffer) - 1;
+        if (copy_from_user(buffer, buf, count))
+                return -EFAULT;
+        err = strict_strtol(strstrip(buffer), 0, &nice);
+        if (err)
+                return -EINVAL;
+        /*
+         * The setter takes an int.  Reject values that do not survive the
+         * narrowing instead of silently wrapping them into a different
+         * (possibly valid) nice level.
+         */
+        val = nice;
+        if (val != nice)
+                return -EINVAL;
+        p = get_proc_task(inode);
+        if (!p)
+                return -ESRCH;
+        err = proc_sched_autogroup_set_nice(p, &val);
+        if (err)
+                count = err;
+        put_task_struct(p);
+        return count;
+}
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
        int ret;
-        ret = single_open(filp, sched_show, NULL);
+        ret = single_open(filp, sched_autogroup_show, NULL);
        if (!ret) {
                struct seq_file *m = filp->private_data;
@@ -1397,15 +1452,15 @@ static int sched_open(struct inode *inode, struct file *filp)
        return ret;
 }
-static const struct file_operations proc_pid_sched_operations = {
+static const struct file_operations proc_pid_sched_autogroup_operations = {
-        .open           = sched_open,
+        .open           = sched_autogroup_open,
        .read           = seq_read,
-        .write          = sched_write,
+        .write          = sched_autogroup_write,
        .llseek         = seq_lseek,
        .release        = single_release,
 };
-#endif
+#endif /* CONFIG_SCHED_AUTOGROUP */
 static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
@@ -1454,15 +1509,7 @@ static int comm_show(struct seq_file *m, void *v)
 static int comm_open(struct inode *inode, struct file *filp)
 {
-        int ret;
+        return single_open(filp, comm_show, inode);
-        ret = single_open(filp, comm_show, NULL);
-        if (!ret) {
-                struct seq_file *m = filp->private_data;
-                m->private = inode;
-        }
-        return ret;
 }
 static const struct file_operations proc_pid_set_comm_operations = {
@@ -1719,10 +1766,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
        const struct cred *cred;
+        if (nd && nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
        if (task) {
                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
                    task_dumpable(task)) {
@@ -1744,7 +1797,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
        return 0;
 }
-static int pid_delete_dentry(struct dentry * dentry)
+static int pid_delete_dentry(const struct dentry * dentry)
 {
        /* Is the task we represent dead?
         * If so, then don't put the dentry on the lru list,
@@ -1888,12 +1941,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
-        int fd = proc_fd(inode);
+        int fd;
        struct files_struct *files;
        const struct cred *cred;
+        if (nd && nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
+        fd = proc_fd(inode);
        if (task) {
                files = get_files_struct(task);
                if (files) {
@@ -1969,7 +2029,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
        inode->i_op = &proc_pid_link_inode_operations;
        inode->i_size = 64;
        ei->op.proc_get_link = proc_fd_link;
-        dentry->d_op = &tid_fd_dentry_operations;
+        d_set_d_op(dentry, &tid_fd_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (tid_fd_revalidate(dentry, NULL))
@@ -2101,11 +2161,13 @@ static const struct file_operations proc_fd_operations = {
 * /proc/pid/fd needs a special permission handler so that a process can still
 * access /proc/self/fd after it has executed a setuid().
 */
-static int proc_fd_permission(struct inode *inode, int mask)
+static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int rv;
-        rv = generic_permission(inode, mask, NULL);
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        rv = generic_permission(inode, mask, flags, NULL);
        if (rv == 0)
                return 0;
        if (task_pid(current) == proc_pid(inode))
@@ -2137,7 +2199,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
        ei->fd = fd;
        inode->i_mode = S_IFREG | S_IRUSR;
        inode->i_fop = &proc_fdinfo_file_operations;
-        dentry->d_op = &tid_fd_dentry_operations;
+        d_set_d_op(dentry, &tid_fd_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (tid_fd_revalidate(dentry, NULL))
@@ -2196,7 +2258,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (pid_revalidate(dentry, NULL))
@@ -2563,8 +2625,14 @@ static const struct pid_entry proc_base_stuff[] = {
 */
 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
        if (task) {
                put_task_struct(task);
                return 1;
@@ -2615,7 +2683,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-        dentry->d_op = &proc_base_dentry_operations;
+        d_set_d_op(dentry, &proc_base_dentry_operations);
        d_add(dentry, inode);
        error = NULL;
 out:
@@ -2733,6 +2801,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+        REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        INF("syscall",    S_IRUSR, proc_pid_syscall),
@@ -2926,7 +2997,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
        inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
                ARRAY_SIZE(tgid_base_stuff));
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
@@ -3169,7 +3240,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
        inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
                ARRAY_SIZE(tid_base_stuff));
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 00000000000..eafc22ab1fd
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ *
+ * Licensed under GPLv2
+ */
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+/*
+ * This is the handler for /proc/consoles.
+ *
+ * Emits one line for the console passed in @v: its name and index padded
+ * out to a 21-character column, R/W/U markers telling whether the read,
+ * write and unblank callbacks are set, one letter per flag in con_flags[],
+ * and, when the console reports a tty driver, the major:minor of the
+ * corresponding device.  Always returns 0.
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+        static const struct {
+                short flag;
+                char name;
+        } con_flags[] = {
+                { CON_ENABLED,          'E' },
+                { CON_CONSDEV,          'C' },
+                { CON_BOOT,             'B' },
+                { CON_PRINTBUFFER,      'p' },
+                { CON_BRL,              'b' },
+                { CON_ANYTIME,          'a' },
+        };
+        char flags[ARRAY_SIZE(con_flags) + 1];
+        struct console *con = v;
+        unsigned int a;
+        int len;
+        dev_t dev = 0;
+        /* Derive a device number from the console's tty driver, if any. */
+        if (con->device) {
+                const struct tty_driver *driver;
+                int index;
+                driver = con->device(con, &index);
+                if (driver) {
+                        dev = MKDEV(driver->major, driver->minor_start);
+                        dev += index;
+                }
+        }
+        /* One column per known flag: its letter when set, else a blank. */
+        for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+                flags[a] = (con->flags & con_flags[a].flag) ?
+                        con_flags[a].name : ' ';
+        flags[a] = 0;
+        /* %n captures the width of "name+index" so it can be padded below. */
+        seq_printf(m, "%s%d%n", con->name, con->index, &len);
+        len = 21 - len;
+        if (len < 1)
+                len = 1;
+        seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+                        con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+                        flags);
+        if (dev)
+                seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+        seq_putc(m, '\n');
+        return 0;
+}
+/*
+ * seq_file start callback: take the console semaphore, then walk the
+ * console list up to the entry at index *pos.  Returns that entry; if the
+ * list is shorter, the iteration cursor's final value is returned
+ * (presumably NULL -- confirm against for_each_console()).
+ */
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+        struct console *cursor;
+        loff_t idx = 0;
+        acquire_console_sem();
+        for_each_console(cursor) {
+                if (idx == *pos)
+                        break;
+                idx++;
+        }
+        return cursor;
+}
+/*
+ * seq_file next callback: bump the position and hand back the console
+ * linked after the current one.
+ */
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        struct console *cur = v;
+        *pos += 1;
+        return cur->next;
+}
+/* seq_file stop callback: release the semaphore taken in c_start(). */
+static void c_stop(struct seq_file *m, void *v)
+{
+        release_console_sem();
+}
+/* Iterator callbacks used by seq_file to walk and print the console list. */
+static const struct seq_operations consoles_op = {
+        .start  = c_start,
+        .stop   = c_stop,
+        .next   = c_next,
+        .show   = show_console_dev,
+};
+/* Open handler: attach the consoles_op iterator to the file via seq_open(). */
+static int consoles_open(struct inode *inode, struct file *file)
+{
+        return seq_open(file, &consoles_op);
+}
+/* File operations for /proc/consoles: a read-only seq_file. */
+static const struct file_operations proc_consoles_operations = {
+        .open           = consoles_open,
+        .release        = seq_release,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+};
+/*
+ * Register the "consoles" entry at the top level of /proc.  The result of
+ * proc_create() is not checked; initialisation always reports success.
+ */
+static int __init proc_consoles_init(void)
+{
+        proc_create("consoles", 0, NULL, &proc_consoles_operations);
+        return 0;
+}
+module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c..b14347167c3 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
        if (i < CHRDEV_MAJOR_HASH_SIZE) {
                if (i == 0)
-                        seq_printf(f, "Character devices:\n");
+                        seq_puts(f, "Character devices:\n");
                chrdev_show(f, i);
        }
 #ifdef CONFIG_BLOCK
        else {
                i -= CHRDEV_MAJOR_HASH_SIZE;
                if (i == 0)
-                        seq_printf(f, "\nBlock devices:\n");
+                        seq_puts(f, "\nBlock devices:\n");
                blkdev_show(f, i);
        }
 #endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f033766..01e07f2a188 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
 * smarter: we could keep a "volatile" flag in the 
 * inode to indicate which ones to keep.
 */
-static int proc_delete_dentry(struct dentry * dentry)
+static int proc_delete_dentry(const struct dentry * dentry)
 {
        return 1;
 }
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                if (de->namelen != dentry->d_name.len)
                        continue;
                if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-                        unsigned int ino;
-                        ino = de->low_ino;
                        pde_get(de);
                        spin_unlock(&proc_subdir_lock);
                        error = -EINVAL;
-                        inode = proc_get_inode(dir->i_sb, ino, de);
+                        inode = proc_get_inode(dir->i_sb, de);
                        goto out_unlock;
                }
        }
@@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 out_unlock:
        if (inode) {
-                dentry->d_op = &proc_dentry_operations;
+                d_set_d_op(dentry, &proc_dentry_operations);
                d_add(dentry, inode);
                return NULL;
        }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
 static void free_proc_entry(struct proc_dir_entry *de)
 {
-        unsigned int ino = de->low_ino;
+        release_inode_number(de->low_ino);
-        if (ino < PROC_DYNAMIC_FIRST)
-                return;
-        release_inode_number(ino);
        if (S_ISLNK(de->mode))
                kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
                wait_for_completion(de->pde_unload_completion);
-                goto continue_removing;
+                spin_lock(&de->pde_unload_lock);
        }
-        spin_unlock(&de->pde_unload_lock);
-continue_removing:
-        spin_lock(&de->pde_unload_lock);
        while (!list_empty(&de->pde_openers)) {
                struct pde_opener *pdeo;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3ddb6068177..176ce4cda68 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
        return inode;
 }
-static void proc_destroy_inode(struct inode *inode)
+static void proc_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
+static void proc_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, proc_i_callback);
+}
 static void init_once(void *foo)
 {
        struct proc_inode *ei = (struct proc_inode *) foo;
@@ -409,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
-struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
-                                struct proc_dir_entry *de)
 {
        struct inode * inode;
-        inode = iget_locked(sb, ino);
+        inode = iget_locked(sb, de->low_ino);
        if (!inode)
                return NULL;
        if (inode->i_state & I_NEW) {
@@ -464,7 +470,7 @@ int proc_fill_super(struct super_block *s)
        s->s_time_gran = 1;
        
        pde_get(&proc_root);
-        root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
+        root_inode = proc_get_inode(s, &proc_root);
        if (!root_inode)
                goto out_no_root;
        root_inode->i_uid = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd1..9ad561ded40 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
+unsigned long task_statm(struct mm_struct *,
+        unsigned long *, unsigned long *, unsigned long *, unsigned long *);
 void task_mem(struct seq_file *, struct mm_struct *);
 static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
 extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 /*
 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468..d245cb23dd7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
        .read           = read_kcore,
        .open           = open_kcore,
-        .llseek         = generic_file_llseek,
+        .llseek         = default_llseek,
 };
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b4566033..b06c674624e 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                        ppage = pfn_to_page(pfn);
                else
                        ppage = NULL;
-                if (!ppage)
+                if (!ppage || PageSlab(ppage))
                        pcount = 0;
                else
                        pcount = page_mapcount(ppage);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b652cb00906..09a1f92a34e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
+#include <linux/namei.h>
 #include "internal.h"
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -120,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
        err = NULL;
-        dentry->d_op = &proc_sys_dentry_operations;
+        d_set_d_op(dentry, &proc_sys_dentry_operations);
        d_add(dentry, inode);
 out:
@@ -201,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
                                dput(child);
                                return -ENOMEM;
                        } else {
-                                child->d_op = &proc_sys_dentry_operations;
+                                d_set_d_op(child, &proc_sys_dentry_operations);
                                d_add(child, inode);
                        }
                } else {
@@ -294,7 +295,7 @@ out:
        return ret;
 }
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
 {
        /*
         * sysctl entries that are not writeable,
@@ -304,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
        struct ctl_table *table;
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        /* Executable files are not allowed under /proc/sys/ */
        if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
                return -EACCES;
@@ -389,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
 {
        return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
+static int proc_sys_compare(const struct dentry *parent,
-                            struct qstr *name)
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct dentry *dentry = container_of(qstr, struct dentry, d_name);
+        /* Although proc doesn't have negative dentries, rcu-walk means
-        if (qstr->len != name->len)
+         * that inode here can be NULL */
+        if (!inode)
+                return 0;
+        if (name->len != len)
                return 1;
-        if (memcmp(qstr->name, name->name, name->len))
+        if (memcmp(name->name, str, len))
                return 1;
-        return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+        return !sysctl_is_seen(PROC_I(inode)->sysctl);
 }
 static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc86943..cb761f01030 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
        }
        switch (p->type) {
        case TTY_DRIVER_TYPE_SYSTEM:
-                seq_printf(m, "system");
+                seq_puts(m, "system");
                if (p->subtype == SYSTEM_TYPE_TTY)
-                        seq_printf(m, ":/dev/tty");
+                        seq_puts(m, ":/dev/tty");
                else if (p->subtype == SYSTEM_TYPE_SYSCONS)
-                        seq_printf(m, ":console");
+                        seq_puts(m, ":console");
                else if (p->subtype == SYSTEM_TYPE_CONSOLE)
-                        seq_printf(m, ":vtmaster");
+                        seq_puts(m, ":vtmaster");
                break;
        case TTY_DRIVER_TYPE_CONSOLE:
-                seq_printf(m, "console");
+                seq_puts(m, "console");
                break;
        case TTY_DRIVER_TYPE_SERIAL:
-                seq_printf(m, "serial");
+                seq_puts(m, "serial");
                break;
        case TTY_DRIVER_TYPE_PTY:
                if (p->subtype == PTY_TYPE_MASTER)
-                        seq_printf(m, "pty:master");
+                        seq_puts(m, "pty:master");
                else if (p->subtype == PTY_TYPE_SLAVE)
-                        seq_printf(m, "pty:slave");
+                        seq_puts(m, "pty:slave");
                else
-                        seq_printf(m, "pty");
+                        seq_puts(m, "pty");
                break;
        default:
                seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
                /* pseudo-drivers first */
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
-                seq_printf(m, "system:/dev/tty\n");
+                seq_puts(m, "system:/dev/tty\n");
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
-                seq_printf(m, "system:console\n");
+                seq_puts(m, "system:console\n");
 #ifdef CONFIG_UNIX98_PTYS
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
-                seq_printf(m, "system\n");
+                seq_puts(m, "system\n");
 #endif
 #ifdef CONFIG_VT
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
                seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
-                seq_printf(m, "system:vtmaster\n");
+                seq_puts(m, "system:vtmaster\n");
 #endif
        }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c98..62604be9f58 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
        int i, j;
-        seq_printf(p, "                    ");
+        seq_puts(p, "                    ");
        for_each_possible_cpu(i)
                seq_printf(p, "CPU%-8d", i);
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
        for (i = 0; i < NR_SOFTIRQS; i++) {
                seq_printf(p, "%12s:", softirq_to_name[i]);
                for_each_possible_cpu(j)
                        seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
-                seq_printf(p, "\n");
+                seq_putc(p, '\n');
        }
        return 0;
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93ba..1cffa2b8a2f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
        for (i = 0; i < NR_SOFTIRQS; i++)
                seq_printf(p, " %u", per_softirq_sums[i]);
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
        return 0;
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c126c83b9a4..c3755bd8dd3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
        return PAGE_SIZE * mm->total_vm;
 }
-int task_statm(struct mm_struct *mm, int *shared, int *text,
+unsigned long task_statm(struct mm_struct *mm,
-               int *data, int *resident)
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
        *shared = get_mm_counter(mm, MM_FILEPAGES);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e6384..b535d3e5d5f 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
        return vsize;
 }
-int task_statm(struct mm_struct *mm, int *shared, int *text,
+unsigned long task_statm(struct mm_struct *mm,
-               int *data, int *resident)
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *p;
-        int size = kobjsize(mm);
+        unsigned long size = kobjsize(mm);
        down_read(&mm->mmap_sem);
        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70b..74802bc5ded 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
        /* Do some basic Verification. */
        if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
                (ehdr.e_type != ET_CORE) ||
-                !vmcore_elf_check_arch(&ehdr) ||
+                !vmcore_elf64_check_arch(&ehdr) ||
                ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
                ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
                ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fcada42f1aa..e63b4171d58 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void qnx4_destroy_inode(struct inode *inode)
+static void qnx4_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
+static void qnx4_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
 static void init_once(void *foo)
 {
        struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efc..84becd3e477 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 void __quota_error(struct super_block *sb, const char *func,
-                  const char *fmt, ...)
+                   const char *fmt, ...)
 {
-        va_list args;
        if (printk_ratelimit()) {
+                va_list args;
+                struct va_format vaf;
                va_start(args, fmt);
-                printk(KERN_ERR "Quota error (device %s): %s: ",
-                       sb->s_id, func);
+                vaf.fmt = fmt;
-                vprintk(fmt, args);
+                vaf.va = &args;
-                printk("\n");
+                printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
+                       sb->s_id, func, &vaf);
                va_end(args);
        }
 }
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabc..e41c1becf09 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                return -ENOMEM;
        ret = read_blk(info, *blk, buf);
        if (ret < 0) {
-                quota_error(dquot->dq_sb, "Can't read quota data "
+                quota_error(dquot->dq_sb, "Can't read quota data block %u",
-                            "block %u", blk);
+                            *blk);
                goto out_buf;
        }
        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                } else {
                        ret = write_blk(info, *blk, buf);
                        if (ret < 0)
-                                quota_error(dquot->dq_sb, "Can't write quota "
+                                quota_error(dquot->dq_sb,
-                                            "tree block %u", blk);
+                                            "Can't write quota tree block %u",
+                                            *blk);
                }
        }
 out_buf:
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f53851..45de98b5946 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...)	//int print_mode, int first, int l
        va_list args;
        int mode, first, last;
-        va_start(args, bh);
        if (!bh) {
                printk("print_block: buffer is NULL\n");
                return;
        }
+        va_start(args, bh);
        mode = va_arg(args, int);
        first = va_arg(args, int);
        last = va_arg(args, int);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b243117b875..2575682a9ea 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -529,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void reiserfs_destroy_inode(struct inode *inode)
+static void reiserfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5d04a7828e7..3cfb2e93364 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -870,11 +870,14 @@ out:
        return err;
 }
-static int reiserfs_check_acl(struct inode *inode, int mask)
+static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int error = -EAGAIN; /* do regular unix permission checks by default */
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (acl) {
@@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
        return 0;
 }
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        /*
         * We don't do permission checks on the internal objects.
         * Permissions are determined by the "owning" object.
@@ -965,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
         * Stat data v1 doesn't support ACLs.
         */
        if (get_inode_sd_version(inode) != STAT_DATA_V1)
-                return generic_permission(inode, mask, reiserfs_check_acl);
+                return generic_permission(inode, mask, flags,
+                                        reiserfs_check_acl);
 #endif
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        return -EPERM;
 }
@@ -990,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
                REISERFS_SB(s)->priv_root = dentry;
-                dentry->d_op = &xattr_lookup_poison_ops;
+                d_set_d_op(dentry, &xattr_lookup_poison_ops);
                if (dentry->d_inode)
                        dentry->d_inode->i_flags |= S_PRIVATE;
        } else
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6647f90e55c..2305e3121cb 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
 /*
 * return a spent inode to the slab cache
 */
-static void romfs_destroy_inode(struct inode *inode)
+static void romfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
+static void romfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, romfs_i_callback);
+}
 /*
 * get filesystem statistics
 */
diff --git a/fs/select.c b/fs/select.c
index b7b10aa3086..e56560d2b08 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
                rts.tv_sec = rts.tv_nsec = 0;
        if (timeval) {
+                if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+                        memset(&rtv, 0, sizeof(rtv));
                rtv.tv_sec = rts.tv_sec;
                rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 24de30ba34c..20700b9f2b4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
 }
-static void squashfs_destroy_inode(struct inode *inode)
+static void squashfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
 }
+static void squashfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, squashfs_i_callback);
+}
 static struct file_system_type squashfs_fs_type = {
        .owner = THIS_MODULE,
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9..823e061faa8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
                INIT_LIST_HEAD(&s->s_files);
 #endif
                INIT_LIST_HEAD(&s->s_instances);
-                INIT_HLIST_HEAD(&s->s_anon);
+                INIT_HLIST_BL_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
                INIT_LIST_HEAD(&s->s_dentry_lru);
                init_rwsem(&s->s_umount);
@@ -1139,7 +1140,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
        return mnt;
 err:
-        mntput(mnt);
+        mntput_long(mnt);
        return ERR_PTR(err);
 }
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b..ea9120a830d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
                goto repeat;
 }
-static int sysfs_dentry_delete(struct dentry *dentry)
+static int sysfs_dentry_delete(const struct dentry *dentry)
 {
        struct sysfs_dirent *sd = dentry->d_fsdata;
        return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
 static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct sysfs_dirent *sd = dentry->d_fsdata;
+        struct sysfs_dirent *sd;
        int is_dir;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        sd = dentry->d_fsdata;
        mutex_lock(&sysfs_mutex);
        /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
        /* instantiate and hash dentry */
        ret = d_find_alias(inode);
        if (!ret) {
-                dentry->d_op = &sysfs_dentry_ops;
+                d_set_d_op(dentry, &sysfs_dentry_ops);
                dentry->d_fsdata = sysfs_get(sd);
                d_add(dentry, inode);
        } else {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af..c8769dc222d 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
        struct attribute *const *attr;
        int i;
-        if (grp)
+        dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-                dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-        else
-                dir_sd = sysfs_get(kobj->sd);
        if (!dir_sd)
                return -ENOENT;
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
        struct sysfs_dirent *dir_sd;
        struct attribute *const *attr;
-        if (grp)
+        dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-                dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-        else
-                dir_sd = sysfs_get(kobj->sd);
        if (dir_sd) {
                for (attr = grp->attrs; *attr; ++attr)
                        sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba3..0a12eb89cd3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/sysfs.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
@@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
                return -ENOENT;
 }
-int sysfs_permission(struct inode *inode, int mask)
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct sysfs_dirent *sd = inode->i_private;
+        struct sysfs_dirent *sd;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        sd = inode->i_private;
        mutex_lock(&sysfs_mutex);
        sysfs_refresh_inode(sd, inode);
        mutex_unlock(&sysfs_mutex);
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e95..3d28af31d86 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
 */
 #include <linux/lockdep.h>
+#include <linux/kobject_ns.h>
 #include <linux/fs.h>
 struct sysfs_open_dirent;
@@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
 void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e..0630eb969a2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
        return &si->vfs_inode;
 }
-static void sysv_destroy_inode(struct inode *inode)
+static void sysv_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
+static void sysv_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, sysv_i_callback);
+}
 static void init_once(void *p)
 {
        struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e7f7d11cd..b5e68da2db3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
        return err;
 }
-static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        /* Truncate the name in place, avoids having to define a compare
           function. */
@@ -47,7 +48,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
        struct inode * inode = NULL;
        ino_t ino;
-        dentry->d_op = dir->i_sb->s_root->d_op;
+        d_set_d_op(dentry, dir->i_sb->s_root->d_op);
        if (dentry->d_name.len > SYSV_NAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
        ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3d9c62be0c1..76712aefc4a 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -346,7 +346,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
        if (sbi->s_forced_ro)
                sb->s_flags |= MS_RDONLY;
        if (sbi->s_truncate)
-                sb->s_root->d_op = &sysv_dentry_operations;
+                d_set_d_op(sb->s_root, &sysv_dentry_operations);
        return 1;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e..6e11c2975dc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
        return &ui->vfs_inode;
 };
+static void ubifs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ubifs_inode *ui = ubifs_inode(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ubifs_inode_slab, ui);
+}
 static void ubifs_destroy_inode(struct inode *inode)
 {
        struct ubifs_inode *ui = ubifs_inode(inode);
        kfree(ui->data);
-        kmem_cache_free(ubifs_inode_slab, inode);
+        call_rcu(&inode->i_rcu, ubifs_i_callback);
 }
 /*
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4..0e0e99bd6bc 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
 config UDF_FS
        tristate "UDF file system support"
-        depends on BKL # needs serious work to remove
        select CRC_ITU_T
        help
          This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4ce..306ee39ef2c 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
                                udf_debug("bit %ld already set\n", bit + i);
                                udf_debug("byte=%2x\n",
                                        ((char *)bh->b_data)[(bit + i) >> 3]);
-                        } else {
-                                udf_add_free_space(sb, sbi->s_partition, 1);
                        }
                }
+                udf_add_free_space(sb, sbi->s_partition, count);
                mark_buffer_dirty(bh);
                if (overflow) {
                        block += count;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf5022..eb8bfe2b89a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode *dir = filp->f_path.dentry->d_inode;
        int result;
-        lock_kernel();
        if (filp->f_pos == 0) {
                if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
-                        unlock_kernel();
                        return 0;
                }
                filp->f_pos++;
        }
        result = do_udf_readdir(dir, filp, filldir, dirent);
-        unlock_kernel();
        return result;
 }
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c..89c78486cbb 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
 #include <linux/string.h> /* memset */
 #include <linux/capability.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        size_t count = iocb->ki_left;
        struct udf_inode_info *iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                if (file->f_flags & O_APPEND)
                        pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                        udf_expand_file_adinicb(inode, pos + count, &err);
                        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                                udf_debug("udf_expand_adinicb: err=%d\n", err);
+                                up_write(&iinfo->i_data_sem);
                                return err;
                        }
                } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                iinfo->i_lenAlloc = inode->i_size;
                }
        }
+        up_write(&iinfo->i_data_sem);
        retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
        if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        long old_block, new_block;
        int result = -EINVAL;
-        lock_kernel();
        if (file_permission(filp, MAY_READ) != 0) {
                udf_debug("no permission to access inode %lu\n", inode->i_ino);
                result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        }
 out:
-        unlock_kernel();
        return result;
 }
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 {
        if (filp->f_mode & FMODE_WRITE) {
                mutex_lock(&inode->i_mutex);
-                lock_kernel();
+                down_write(&UDF_I(inode)->i_data_sem);
                udf_discard_prealloc(inode);
                udf_truncate_tail_extent(inode);
-                unlock_kernel();
+                up_write(&UDF_I(inode)->i_data_sem);
                mutex_unlock(&inode->i_mutex);
        }
        return 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc..6fb7e0adcda 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
                return NULL;
        }
-        mutex_lock(&sbi->s_alloc_mutex);
        if (sbi->s_lvid_bh) {
-                struct logicalVolIntegrityDesc *lvid =
+                struct logicalVolIntegrityDescImpUse *lvidiu;
-                        (struct logicalVolIntegrityDesc *)
-                        sbi->s_lvid_bh->b_data;
+                iinfo->i_unique = lvid_get_unique_id(sb);
-                struct logicalVolIntegrityDescImpUse *lvidiu =
+                mutex_lock(&sbi->s_alloc_mutex);
-                                                        udf_sb_lvidiu(sbi);
+                lvidiu = udf_sb_lvidiu(sbi);
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                (lvid->logicalVolContentsUse);
                if (S_ISDIR(mode))
                        le32_add_cpu(&lvidiu->numDirs, 1);
                else
                        le32_add_cpu(&lvidiu->numFiles, 1);
-                iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
                udf_updated_lvid(sb);
+                mutex_unlock(&sbi->s_alloc_mutex);
        }
-        mutex_unlock(&sbi->s_alloc_mutex);
        inode_init_owner(inode, dir, mode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2d..c6a2e782b97 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
 #include "udfdecl.h"
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
 static mode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
 static void udf_fill_inode(struct inode *, struct buffer_head *);
+static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
                                        sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
                want_delete = 1;
                inode->i_size = 0;
                udf_truncate(inode);
-                lock_kernel();
                udf_update_inode(inode, IS_SYNC(inode));
-                unlock_kernel();
        }
        invalidate_inode_buffers(inode);
        end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
        kfree(iinfo->i_ext.i_data);
        iinfo->i_ext.i_data = NULL;
        if (want_delete) {
-                lock_kernel();
                udf_free_inode(inode);
-                unlock_kernel();
        }
 }
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
        err = -EIO;
        new = 0;
        bh = NULL;
-        lock_kernel();
        iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        if (block == iinfo->i_next_alloc_block + 1) {
                iinfo->i_next_alloc_block++;
                iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
        map_bh(bh_result, inode->i_sb, phys);
 abort:
-        unlock_kernel();
+        up_write(&iinfo->i_data_sem);
        return err;
 }
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        lock_kernel();
        iinfo = UDF_I(inode);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+                down_write(&iinfo->i_data_sem);
                if (inode->i_sb->s_blocksize <
                                (udf_file_entry_alloc_offset(inode) +
                                 inode->i_size)) {
                        udf_expand_file_adinicb(inode, inode->i_size, &err);
                        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                                inode->i_size = iinfo->i_lenAlloc;
-                                unlock_kernel();
+                                up_write(&iinfo->i_data_sem);
                                return;
                        } else
                                udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
                                offset - udf_file_entry_alloc_offset(inode));
                        iinfo->i_lenAlloc = inode->i_size;
                }
+                up_write(&iinfo->i_data_sem);
        } else {
                block_truncate_page(inode->i_mapping, inode->i_size,
                                    udf_get_block);
+                down_write(&iinfo->i_data_sem);
                udf_truncate_extents(inode);
+                up_write(&iinfo->i_data_sem);
        }
        inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
                udf_sync_inode(inode);
        else
                mark_inode_dirty(inode);
-        unlock_kernel();
 }
 static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
                return;
        }
+        read_lock(&sbi->s_cred_lock);
        inode->i_uid = le32_to_cpu(fe->uid);
        if (inode->i_uid == -1 ||
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
                inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
-        inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
-        if (!inode->i_nlink)
-                inode->i_nlink = 1;
-        inode->i_size = le64_to_cpu(fe->informationLength);
-        iinfo->i_lenExtents = inode->i_size;
        if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
                        sbi->s_fmode != UDF_INVALID_MODE)
                inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
        else
                inode->i_mode = udf_convert_permissions(fe);
        inode->i_mode &= ~sbi->s_umask;
+        read_unlock(&sbi->s_cred_lock);
+        inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
+        if (!inode->i_nlink)
+                inode->i_nlink = 1;
+        inode->i_size = le64_to_cpu(fe->informationLength);
+        iinfo->i_lenExtents = inode->i_size;
        if (iinfo->i_efe == 0) {
                inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
 int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-        int ret;
+        return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-        lock_kernel();
-        ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-        unlock_kernel();
-        return ret;
 }
-int udf_sync_inode(struct inode *inode)
+static int udf_sync_inode(struct inode *inode)
 {
        return udf_update_inode(inode, 1);
 }
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
        struct extent_position epos = {};
        int ret;
-        lock_kernel();
+        down_read(&UDF_I(inode)->i_data_sem);
        if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
                                                (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
        else
                ret = 0;
-        unlock_kernel();
+        up_read(&UDF_I(inode)->i_data_sem);
        brelse(epos.bh);
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baeb..2be0f9eb86d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
                }
                if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
-                    isdotdot) {
+                    isdotdot)
-                        brelse(epos.bh);
+                        goto out_ok;
-                        return fi;
-                }
                if (!lfi)
                        continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
        if (dentry->d_name.len > UDF_NAME_LEN - 2)
                return ERR_PTR(-ENAMETOOLONG);
-        lock_kernel();
 #ifdef UDF_RECOVERY
        /* temporary shorthand for specifying files by inode number */
        if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
                };
                inode = udf_iget(dir->i_sb, lb);
                if (!inode) {
-                        unlock_kernel();
                        return ERR_PTR(-EACCES);
                }
        } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
                loc = lelb_to_cpu(cfi.icb.extLocation);
                inode = udf_iget(dir->i_sb, &loc);
                if (!inode) {
-                        unlock_kernel();
                        return ERR_PTR(-EACCES);
                }
        }
-        unlock_kernel();
        return d_splice_alias(inode, dentry);
 }
@@ -476,15 +469,19 @@ add:
                                f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
                if (!fibh->ebh)
                        goto out_err;
+                /* Extents could have been merged, invalidate our position */
+                brelse(epos.bh);
+                epos.bh = NULL;
+                epos.block = dinfo->i_location;
+                epos.offset = udf_file_entry_alloc_offset(dir);
                if (!fibh->soffset) {
-                        if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
+                        /* Find the freshly allocated block */
-                            (EXT_RECORDED_ALLOCATED >> 30)) {
+                        while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
-                                block = eloc.logicalBlockNum + ((elen - 1) >>
+                                (EXT_RECORDED_ALLOCATED >> 30))
+                                ;
+                        block = eloc.logicalBlockNum + ((elen - 1) >>
                                        dir->i_sb->s_blocksize_bits);
-                        } else
-                                block++;
                        brelse(fibh->sbh);
                        fibh->sbh = fibh->ebh;
                        fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
        int err;
        struct udf_inode_info *iinfo;
-        lock_kernel();
        inode = udf_new_inode(dir, mode, &err);
        if (!inode) {
-                unlock_kernel();
                return err;
        }
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
                inode->i_nlink--;
                mark_inode_dirty(inode);
                iput(inode);
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
        if (fibh.sbh != fibh.ebh)
                brelse(fibh.ebh);
        brelse(fibh.sbh);
-        unlock_kernel();
        d_instantiate(dentry, inode);
        return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
        if (!old_valid_dev(rdev))
                return -EINVAL;
-        lock_kernel();
        err = -EIO;
        inode = udf_new_inode(dir, mode, &err);
        if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
                inode->i_nlink--;
                mark_inode_dirty(inode);
                iput(inode);
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
        err = 0;
 out:
-        unlock_kernel();
        return err;
 }
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct udf_inode_info *dinfo = UDF_I(dir);
        struct udf_inode_info *iinfo;
-        lock_kernel();
        err = -EMLINK;
        if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
                goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        err = 0;
 out:
-        unlock_kernel();
        return err;
 }
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
        struct kernel_lb_addr tloc;
        retval = -ENOENT;
-        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
        if (!fi)
                goto out;
@@ -826,7 +813,6 @@ end_rmdir:
        brelse(fibh.sbh);
 out:
-        unlock_kernel();
        return retval;
 }
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
        struct kernel_lb_addr tloc;
        retval = -ENOENT;
-        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
        if (!fi)
                goto out;
@@ -870,7 +855,6 @@ end_unlink:
        brelse(fibh.sbh);
 out:
-        unlock_kernel();
        return retval;
 }
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        int block;
        unsigned char *name = NULL;
        int namelen;
-        struct buffer_head *bh;
        struct udf_inode_info *iinfo;
+        struct super_block *sb = dir->i_sb;
-        lock_kernel();
        inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
        if (!inode)
                goto out;
+        iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
        if (!name) {
                err = -ENOMEM;
                goto out_no_entry;
        }
-        iinfo = UDF_I(inode);
        inode->i_data.a_ops = &udf_symlink_aops;
        inode->i_op = &udf_symlink_inode_operations;
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                struct kernel_lb_addr eloc;
                uint32_t bsize;
-                block = udf_new_block(inode->i_sb, inode,
+                block = udf_new_block(sb, inode,
                                iinfo->i_location.partitionReferenceNum,
                                iinfo->i_location.logicalBlockNum, &err);
                if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                eloc.logicalBlockNum = block;
                eloc.partitionReferenceNum =
                                iinfo->i_location.partitionReferenceNum;
-                bsize = inode->i_sb->s_blocksize;
+                bsize = sb->s_blocksize;
                iinfo->i_lenExtents = bsize;
                udf_add_aext(inode, &epos, &eloc, bsize, 0);
                brelse(epos.bh);
-                block = udf_get_pblock(inode->i_sb, block,
+                block = udf_get_pblock(sb, block,
                                iinfo->i_location.partitionReferenceNum,
                                0);
-                epos.bh = udf_tgetblk(inode->i_sb, block);
+                epos.bh = udf_tgetblk(sb, block);
                lock_buffer(epos.bh);
-                memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
+                memset(epos.bh->b_data, 0x00, bsize);
                set_buffer_uptodate(epos.bh);
                unlock_buffer(epos.bh);
                mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        } else
                ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
-        eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode);
+        eoffset = sb->s_blocksize - udf_ext0_offset(inode);
        pc = (struct pathComponent *)ea;
        if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                }
                if (pc->componentType == 5) {
-                        namelen = udf_put_filename(inode->i_sb, compstart, name,
+                        namelen = udf_put_filename(sb, compstart, name,
                                                   symname - compstart);
                        if (!namelen)
                                goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
        if (!fi)
                goto out_no_entry;
-        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+        cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
        cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-        bh = UDF_SB(inode->i_sb)->s_lvid_bh;
+        if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-        if (bh) {
-                struct logicalVolIntegrityDesc *lvid =
-                                (struct logicalVolIntegrityDesc *)bh->b_data;
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                lvid->logicalVolContentsUse;
-                uniqueID = le64_to_cpu(lvhd->uniqueID);
                *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-                        cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL);
+                        cpu_to_le32(lvid_get_unique_id(sb));
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
-                mark_buffer_dirty(bh);
        }
        udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
        if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                mark_inode_dirty(dir);
+        up_write(&iinfo->i_data_sem);
        if (fibh.sbh != fibh.ebh)
                brelse(fibh.ebh);
        brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 out:
        kfree(name);
-        unlock_kernel();
        return err;
 out_no_entry:
+        up_write(&iinfo->i_data_sem);
        inode_dec_link_count(inode);
        iput(inode);
        goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        struct udf_fileident_bh fibh;
        struct fileIdentDesc cfi, *fi;
        int err;
-        struct buffer_head *bh;
-        lock_kernel();
        if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
-                unlock_kernel();
                return -EMLINK;
        }
        fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
        if (!fi) {
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
-        bh = UDF_SB(inode->i_sb)->s_lvid_bh;
+        if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-        if (bh) {
-                struct logicalVolIntegrityDesc *lvid =
-                                (struct logicalVolIntegrityDesc *)bh->b_data;
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                (lvid->logicalVolContentsUse);
-                uniqueID = le64_to_cpu(lvhd->uniqueID);
                *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-                        cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL);
+                        cpu_to_le32(lvid_get_unique_id(inode->i_sb));
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
-                mark_buffer_dirty(bh);
        }
        udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
        if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        mark_inode_dirty(inode);
        ihold(inode);
        d_instantiate(dentry, inode);
-        unlock_kernel();
        return 0;
 }
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct kernel_lb_addr tloc;
        struct udf_inode_info *old_iinfo = UDF_I(old_inode);
-        lock_kernel();
        ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
        if (ofi) {
                if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
                        brelse(nfibh.ebh);
                brelse(nfibh.sbh);
        }
-        unlock_kernel();
        return retval;
 }
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
        struct fileIdentDesc cfi;
        struct udf_fileident_bh fibh;
-        lock_kernel();
        if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
                goto out_unlock;
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
        inode = udf_iget(child->d_inode->i_sb, &tloc);
        if (!inode)
                goto out_unlock;
-        unlock_kernel();
        return d_obtain_alias(inode);
 out_unlock:
-        unlock_kernel();
        return ERR_PTR(-EACCES);
 }
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0..a71090ea0e0 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
+#include <linux/mutex.h>
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
                        uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
        struct udf_sb_info *sbi = UDF_SB(sb);
        u16 reallocationTableLen;
        struct buffer_head *bh;
+        int ret = 0;
+        mutex_lock(&sbi->s_alloc_mutex);
        for (i = 0; i < sbi->s_partitions; i++) {
                struct udf_part_map *map = &sbi->s_partmaps[i];
                if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                        break;
                                }
-                        if (!st)
+                        if (!st) {
-                                return 1;
+                                ret = 1;
+                                goto out;
+                        }
                        reallocationTableLen =
                                        le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
-                                        return 0;
+                                        ret = 0;
+                                        goto out;
                                } else if (origLoc == packet) {
                                        *new_block = le32_to_cpu(
                                                        entry->mappedLocation) +
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
-                                        return 0;
+                                        ret = 0;
+                                        goto out;
                                } else if (origLoc > packet)
                                        break;
                        }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                              st->mapEntry[k].mappedLocation) +
                                        ((old_block - map->s_partition_root) &
                                         (sdata->s_packet_len - 1));
-                                return 0;
+                                ret = 0;
+                                goto out;
                        }
-                        return 1;
+                        ret = 1;
+                        goto out;
                } /* if old_block */
        }
        if (i == sbi->s_partitions) {
                /* outside of partitions */
                /* for now, fail =) */
-                return 1;
+                ret = 1;
        }
-        return 0;
+out:
+        mutex_unlock(&sbi->s_alloc_mutex);
+        return ret;
 }
 static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a5c7c61836..7b27b063ff6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
 #include <linux/stat.h>
 #include <linux/cdrom.h>
 #include <linux/nls.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/vmalloc.h>
@@ -135,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
        ei->i_next_alloc_block = 0;
        ei->i_next_alloc_goal = 0;
        ei->i_strat4096 = 0;
+        init_rwsem(&ei->i_data_sem);
        return &ei->vfs_inode;
 }
-static void udf_destroy_inode(struct inode *inode)
+static void udf_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
+static void udf_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, udf_i_callback);
+}
 static void init_once(void *foo)
 {
        struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -567,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        if (!udf_parse_options(options, &uopt, true))
                return -EINVAL;
-        lock_kernel();
+        write_lock(&sbi->s_cred_lock);
        sbi->s_flags = uopt.flags;
        sbi->s_uid   = uopt.uid;
        sbi->s_gid   = uopt.gid;
        sbi->s_umask = uopt.umask;
        sbi->s_fmode = uopt.fmode;
        sbi->s_dmode = uopt.dmode;
+        write_unlock(&sbi->s_cred_lock);
        if (sbi->s_lvid_bh) {
                int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -590,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
                udf_open_lvid(sb);
 out_unlock:
-        unlock_kernel();
        return error;
 }
@@ -959,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
                (sizeof(struct buffer_head *) * nr_groups);
        if (size <= PAGE_SIZE)
-                bitmap = kmalloc(size, GFP_KERNEL);
+                bitmap = kzalloc(size, GFP_KERNEL);
        else
-                bitmap = vmalloc(size); /* TODO: get rid of vmalloc */
+                bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
        if (bitmap == NULL) {
                udf_error(sb, __func__,
@@ -970,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
                return NULL;
        }
-        memset(bitmap, 0x00, size);
        bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
        bitmap->s_nr_groups = nr_groups;
        return bitmap;
@@ -1774,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
        if (!bh)
                return;
+        mutex_lock(&sbi->s_alloc_mutex);
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sbi);
@@ -1790,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
+        mutex_unlock(&sbi->s_alloc_mutex);
 }
 static void udf_close_lvid(struct super_block *sb)
@@ -1802,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
        if (!bh)
                return;
+        mutex_lock(&sbi->s_alloc_mutex);
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sbi);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1822,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
+        mutex_unlock(&sbi->s_alloc_mutex);
+}
+u64 lvid_get_unique_id(struct super_block *sb)
+{
+        struct buffer_head *bh;
+        struct udf_sb_info *sbi = UDF_SB(sb);
+        struct logicalVolIntegrityDesc *lvid;
+        struct logicalVolHeaderDesc *lvhd;
+        u64 uniqueID;
+        u64 ret;
+        bh = sbi->s_lvid_bh;
+        if (!bh)
+                return 0;
+        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+        lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
+        mutex_lock(&sbi->s_alloc_mutex);
+        ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
+        if (!(++uniqueID & 0xFFFFFFFF))
+                uniqueID += 16;
+        lvhd->uniqueID = cpu_to_le64(uniqueID);
+        mutex_unlock(&sbi->s_alloc_mutex);
+        mark_buffer_dirty(bh);
+        return ret;
 }
 static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1879,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        struct kernel_lb_addr rootdir, fileset;
        struct udf_sb_info *sbi;
-        lock_kernel();
        uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
        uopt.uid = -1;
        uopt.gid = -1;
@@ -1889,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        uopt.dmode = UDF_INVALID_MODE;
        sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
-        if (!sbi) {
+        if (!sbi)
-                unlock_kernel();
                return -ENOMEM;
-        }
        sb->s_fs_info = sbi;
@@ -1929,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        sbi->s_fmode = uopt.fmode;
        sbi->s_dmode = uopt.dmode;
        sbi->s_nls_map = uopt.nls_map;
+        rwlock_init(&sbi->s_cred_lock);
        if (uopt.session == 0xFFFFFFFF)
                sbi->s_session = udf_get_last_session(sb);
@@ -2038,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
                goto error_out;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
-        unlock_kernel();
        return 0;
 error_out:
@@ -2059,7 +2093,6 @@ error_out:
        kfree(sbi);
        sb->s_fs_info = NULL;
-        unlock_kernel();
        return -EINVAL;
 }
@@ -2098,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
        sbi = UDF_SB(sb);
-        lock_kernel();
        if (sbi->s_vat_inode)
                iput(sbi->s_vat_inode);
        if (sbi->s_partitions)
@@ -2115,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
        kfree(sbi->s_partmaps);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
-        unlock_kernel();
 }
 static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2179,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
        uint16_t ident;
        struct spaceBitmapDesc *bm;
-        lock_kernel();
        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
        bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2217,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
                }
        }
        brelse(bh);
 out:
-        unlock_kernel();
        return accum;
 }
@@ -2233,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
        int8_t etype;
        struct extent_position epos;
-        lock_kernel();
+        mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
        epos.block = UDF_I(table)->i_location;
        epos.offset = sizeof(struct unallocSpaceEntry);
        epos.bh = NULL;
@@ -2243,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
                accum += (elen >> table->i_sb->s_blocksize_bits);
        brelse(epos.bh);
+        mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
-        unlock_kernel();
        return accum;
 }
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b..b1d4488b0f1 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "udf_i.h"
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
        int err = -EIO;
        unsigned char *p = kmap(page);
        struct udf_inode_info *iinfo;
+        uint32_t pos;
-        lock_kernel();
        iinfo = UDF_I(inode);
+        pos = udf_block_map(inode, 0);
+        down_read(&iinfo->i_data_sem);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
        } else {
-                bh = sb_bread(inode->i_sb, udf_block_map(inode, 0));
+                bh = sb_bread(inode->i_sb, pos);
                if (!bh)
                        goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
        udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
        brelse(bh);
-        unlock_kernel();
+        up_read(&iinfo->i_data_sem);
        SetPageUptodate(page);
        kunmap(page);
        unlock_page(page);
        return 0;
 out:
-        unlock_kernel();
+        up_read(&iinfo->i_data_sem);
        SetPageError(page);
        kunmap(page);
        unlock_page(page);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de4107..d1bd31ea724 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
 #ifndef _UDF_I_H
 #define _UDF_I_H
+/*
+ * The i_data_sem and i_mutex protect the allocation information of regular
+ * files and symlinks. This includes all extents belonging to the
+ * file/symlink, whether the data are in-inode or in external data blocks,
+ * preallocation, goal block information... When extents are read,
+ * i_mutex or i_data_sem must be held (holding i_data_sem for reading is
+ * enough). When extents are changed, i_data_sem must be held for writing
+ * and also i_mutex must be held.
+ *
+ * For directories i_mutex is used for all the necessary protection.
+ */
 struct udf_inode_info {
        struct timespec         i_crtime;
        /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
                struct long_ad          *i_lad;
                __u8            *i_data;
        } i_ext;
+        struct rw_semaphore     i_data_sem;
        struct inode vfs_inode;
 };
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c276..4858c191242 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
 #define __LINUX_UDF_SB_H
 #include <linux/mutex.h>
+#include <linux/bitops.h>
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC                 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
        uid_t                   s_uid;
        mode_t                  s_fmode;
        mode_t                  s_dmode;
+        /* Lock protecting consistency of above permission settings */
+        rwlock_t                s_cred_lock;
        /* Root Info */
        struct timespec         s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
        __u16                   s_udfrev;
        /* Miscellaneous flags */
-        __u32                   s_flags;
+        unsigned long           s_flags;
        /* Encoding info */
        struct nls_table        *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
 int udf_compute_nr_groups(struct super_block *sb, u32 partition);
-#define UDF_QUERY_FLAG(X,Y)                     ( UDF_SB(X)->s_flags & ( 1 << (Y) ) )
+static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
-#define UDF_SET_FLAG(X,Y)                       ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) )
+{
-#define UDF_CLEAR_FLAG(X,Y)                     ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) )
+        return test_bit(flag, &UDF_SB(sb)->s_flags);
+}
+static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
+{
+        set_bit(flag, &UDF_SB(sb)->s_flags);
+}
+static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
+{
+        clear_bit(flag, &UDF_SB(sb)->s_flags);
+}
 #endif /* __LINUX_UDF_SB_H */
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f430..eba48209f9f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
 };
 /* super.c */
+__attribute__((format(printf, 3, 4)))
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
 static inline void udf_updated_lvid(struct super_block *sb)
 {
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
        sb->s_dirt = 1;
        UDF_SB(sb)->s_lvid_dirty = 1;
 }
+extern u64 lvid_get_unique_id(struct super_block *sb);
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
-extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c47daed56d..2c61ac5d4e4 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ufs_destroy_inode(struct inode *inode)
+static void ufs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
+static void ufs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ufs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c37081..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-typedef struct sv_s {
-        wait_queue_head_t waiters;
-} sv_t;
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
-        DECLARE_WAITQUEUE(wait, current);
-        add_wait_queue_exclusive(&sv->waiters, &wait);
-        __set_current_state(TASK_UNINTERRUPTIBLE);
-        spin_unlock(lock);
-        schedule();
-        remove_wait_queue(&sv->waiters, &wait);
-}
-#define sv_init(sv,flag,name) \
-        init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-        /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-        _sv_wait(sv, lock)
-#define sv_signal(sv) \
-        wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-        wake_up_all(&(sv)->waiters)
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3..39f4f809bb6 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 int
-xfs_check_acl(struct inode *inode, int mask)
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct xfs_inode *ip = XFS_I(inode);
+        struct xfs_inode *ip;
        struct posix_acl *acl;
        int error = -EAGAIN;
+        ip = XFS_I(inode);
        trace_xfs_check_acl(ip);
        /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
        if (!XFS_IFORK_Q(ip))
                return -EAGAIN;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
        acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed..ec7bbb5645b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-        IO_READ,        /* mapping for a read */
-        IO_DELAY,       /* mapping covers delalloc region */
-        IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
-        IO_NEW          /* just allocated */
-};
 /*
 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
        xfs_inode_t             *ip = XFS_I(ioend->io_inode);
        xfs_fsize_t             isize;
-        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-        ASSERT(ioend->io_type != IO_READ);
        if (unlikely(ioend->io_error))
                return 0;
@@ -244,10 +232,8 @@ xfs_end_io(
         * We might have to update the on-disk file size after extending
         * writes.
         */
-        if (ioend->io_type != IO_READ) {
+        error = xfs_setfilesize(ioend);
-                error = xfs_setfilesize(ioend);
+        ASSERT(!error || error == EAGAIN);
-                ASSERT(!error || error == EAGAIN);
-        }
        /*
         * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
 xfs_map_blocks(
        struct inode            *inode,
        loff_t                  offset,
-        ssize_t                 count,
        struct xfs_bmbt_irec    *imap,
-        int                     flags)
+        int                     type,
+        int                     nonblocking)
 {
-        int                     nmaps = 1;
+        struct xfs_inode        *ip = XFS_I(inode);
-        int                     new = 0;
+        struct xfs_mount        *mp = ip->i_mount;
+        ssize_t                 count = 1 << inode->i_blkbits;
+        xfs_fileoff_t           offset_fsb, end_fsb;
+        int                     error = 0;
+        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
+        int                     nimaps = 1;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -XFS_ERROR(EIO);
+        if (type == IO_UNWRITTEN)
+                bmapi_flags |= XFS_BMAPI_IGSTATE;
+        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+                if (nonblocking)
+                        return -XFS_ERROR(EAGAIN);
+                xfs_ilock(ip, XFS_ILOCK_SHARED);
+        }
-        return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+               (ip->i_df.if_flags & XFS_IFEXTENTS));
+        ASSERT(offset <= mp->m_maxioffset);
+        if (offset + count > mp->m_maxioffset)
+                count = mp->m_maxioffset - offset;
+        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+        offset_fsb = XFS_B_TO_FSBT(mp, offset);
+        error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                          bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        if (error)
+                return -XFS_ERROR(error);
+        if (type == IO_DELALLOC &&
+            (!nimaps || isnullstartblock(imap->br_startblock))) {
+                error = xfs_iomap_write_allocate(ip, offset, count, imap);
+                if (!error)
+                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+                return -XFS_ERROR(error);
+        }
+#ifdef DEBUG
+        if (type == IO_UNWRITTEN) {
+                ASSERT(nimaps);
+                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+        }
+#endif
+        if (nimaps)
+                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+        return 0;
 }
 STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
        submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
                   WRITE_SYNC_PLUG : WRITE, bio);
-        ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-        bio_put(bio);
 }
 STATIC struct bio *
 xfs_alloc_ioend_bio(
        struct buffer_head      *bh)
 {
-        struct bio              *bio;
        int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
+        struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
-        do {
-                bio = bio_alloc(GFP_NOIO, nvecs);
-                nvecs >>= 1;
-        } while (!bio);
        ASSERT(bio->bi_private == NULL);
        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
-        bio_get(bio);
        return bio;
 }
@@ -470,9 +497,8 @@ xfs_submit_ioend(
        /* Pass 1 - start writeback */
        do {
                next = ioend->io_list;
-                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
                        xfs_start_buffer_writeback(bh);
-                }
        } while ((ioend = next) != NULL);
        /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-        lock_buffer(bh);
        xfs_map_buffer(inode, bh, imap, offset);
-        bh->b_bdev = xfs_find_bdev_for_inode(inode);
        set_buffer_mapped(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
 }
 /*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-        struct page             *page,
-        unsigned int            pg_offset)
-{
-        struct buffer_head      *bh, *head;
-        int                     ret = 0;
-        if (PageWriteback(page))
-                return 0;
-        if (!PageDirty(page))
-                return 0;
-        if (!page->mapping)
-                return 0;
-        if (!page_has_buffers(page))
-                return 0;
-        bh = head = page_buffers(page);
-        do {
-                if (!buffer_uptodate(bh))
-                        break;
-                if (!buffer_mapped(bh))
-                        break;
-                ret += bh->b_size;
-                if (ret >= pg_offset)
-                        break;
-        } while ((bh = bh->b_this_page) != head);
-        return ret;
-}
-STATIC size_t
-xfs_probe_cluster(
-        struct inode            *inode,
-        struct page             *startpage,
-        struct buffer_head      *bh,
-        struct buffer_head      *head)
-{
-        struct pagevec          pvec;
-        pgoff_t                 tindex, tlast, tloff;
-        size_t                  total = 0;
-        int                     done = 0, i;
-        /* First sum forwards in this page */
-        do {
-                if (!buffer_uptodate(bh) || !buffer_mapped(bh))
-                        return total;
-                total += bh->b_size;
-        } while ((bh = bh->b_this_page) != head);
-        /* if we reached the end of the page, sum forwards in following pages */
-        tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-        tindex = startpage->index + 1;
-        /* Prune this back to avoid pathological behavior */
-        tloff = min(tlast, startpage->index + 64);
-        pagevec_init(&pvec, 0);
-        while (!done && tindex <= tloff) {
-                unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-                if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-                        break;
-                for (i = 0; i < pagevec_count(&pvec); i++) {
-                        struct page *page = pvec.pages[i];
-                        size_t pg_offset, pg_len = 0;
-                        if (tindex == tlast) {
-                                pg_offset =
-                                    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-                                if (!pg_offset) {
-                                        done = 1;
-                                        break;
-                                }
-                        } else
-                                pg_offset = PAGE_CACHE_SIZE;
-                        if (page->index == tindex && trylock_page(page)) {
-                                pg_len = xfs_probe_page(page, pg_offset);
-                                unlock_page(page);
-                        }
-                        if (!pg_len) {
-                                done = 1;
-                                break;
-                        }
-                        total += pg_len;
-                        tindex++;
-                }
-                pagevec_release(&pvec);
-                cond_resched();
-        }
-        return total;
-}
-/*
 * Test if a given page is suitable for writing as part of an unwritten
 * or delayed allocate extent.
 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
                        if (buffer_unwritten(bh))
                                acceptable = (type == IO_UNWRITTEN);
                        else if (buffer_delay(bh))
-                                acceptable = (type == IO_DELAY);
+                                acceptable = (type == IO_DELALLOC);
                        else if (buffer_dirty(bh) && buffer_mapped(bh))
-                                acceptable = (type == IO_NEW);
+                                acceptable = (type == IO_OVERWRITE);
                        else
                                break;
                } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
        loff_t                  tindex,
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
-        struct writeback_control *wbc,
+        struct writeback_control *wbc)
-        int                     all_bh)
 {
        struct buffer_head      *bh, *head;
        xfs_off_t               end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
                        continue;
                }
-                if (buffer_unwritten(bh) || buffer_delay(bh)) {
+                if (buffer_unwritten(bh) || buffer_delay(bh) ||
+                    buffer_mapped(bh)) {
                        if (buffer_unwritten(bh))
                                type = IO_UNWRITTEN;
+                        else if (buffer_delay(bh))
+                                type = IO_DELALLOC;
                        else
-                                type = IO_DELAY;
+                                type = IO_OVERWRITE;
                        if (!xfs_imap_valid(inode, imap, offset)) {
                                done = 1;
                                continue;
                        }
-                        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+                        lock_buffer(bh);
-                        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+                        if (type != IO_OVERWRITE)
+                                xfs_map_at_offset(inode, bh, imap, offset);
-                        xfs_map_at_offset(inode, bh, imap, offset);
                        xfs_add_to_ioend(inode, bh, offset, type,
                                         ioendp, done);
                        page_dirty--;
                        count++;
                } else {
-                        type = IO_NEW;
+                        done = 1;
-                        if (buffer_mapped(bh) && all_bh) {
-                                lock_buffer(bh);
-                                xfs_add_to_ioend(inode, bh, offset,
-                                                type, ioendp, done);
-                                count++;
-                                page_dirty--;
-                        } else {
-                                done = 1;
-                        }
                }
        } while (offset += len, (bh = bh->b_this_page) != head);
@@ -876,7 +790,6 @@ xfs_cluster_write(
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
        struct writeback_control *wbc,
-        int                     all_bh,
        pgoff_t                 tlast)
 {
        struct pagevec          pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                        imap, ioendp, wbc, all_bh);
+                                        imap, ioendp, wbc);
                        if (done)
                                break;
                }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
        struct buffer_head      *bh, *head;
        loff_t                  offset = page_offset(page);
-        if (!xfs_is_delayed_page(page, IO_DELAY))
+        if (!xfs_is_delayed_page(page, IO_DELALLOC))
                goto out_invalidate;
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
        unsigned int            type;
        __uint64_t              end_offset;
        pgoff_t                 end_index, last_index;
-        ssize_t                 size, len;
+        ssize_t                 len;
-        int                     flags, err, imap_valid = 0, uptodate = 1;
+        int                     err, imap_valid = 0, uptodate = 1;
        int                     count = 0;
-        int                     all_bh = 0;
+        int                     nonblocking = 0;
        trace_xfs_writepage(inode, page, 0);
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
        bh = head = page_buffers(page);
        offset = page_offset(page);
-        flags = BMAPI_READ;
+        type = IO_OVERWRITE;
-        type = IO_NEW;
+        if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+                nonblocking = 1;
        do {
+                int new_ioend = 0;
                if (offset >= end_offset)
                        break;
                if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
                        continue;
                }
-                if (imap_valid)
+                if (buffer_unwritten(bh)) {
-                        imap_valid = xfs_imap_valid(inode, &imap, offset);
+                        if (type != IO_UNWRITTEN) {
-                if (buffer_unwritten(bh) || buffer_delay(bh)) {
-                        int new_ioend = 0;
-                        /*
-                         * Make sure we don't use a read-only iomap
-                         */
-                        if (flags == BMAPI_READ)
-                                imap_valid = 0;
-                        if (buffer_unwritten(bh)) {
                                type = IO_UNWRITTEN;
-                                flags = BMAPI_WRITE | BMAPI_IGNSTATE;
+                                imap_valid = 0;
-                        } else if (buffer_delay(bh)) {
-                                type = IO_DELAY;
-                                flags = BMAPI_ALLOCATE;
-                                if (wbc->sync_mode == WB_SYNC_NONE)
-                                        flags |= BMAPI_TRYLOCK;
-                        }
-                        if (!imap_valid) {
-                                /*
-                                 * If we didn't have a valid mapping then we
-                                 * need to ensure that we put the new mapping
-                                 * in a new ioend structure. This needs to be
-                                 * done to ensure that the ioends correctly
-                                 * reflect the block mappings at io completion
-                                 * for unwritten extent conversion.
-                                 */
-                                new_ioend = 1;
-                                err = xfs_map_blocks(inode, offset, len,
-                                                &imap, flags);
-                                if (err)
-                                        goto error;
-                                imap_valid = xfs_imap_valid(inode, &imap,
-                                                            offset);
                        }
-                        if (imap_valid) {
+                } else if (buffer_delay(bh)) {
-                                xfs_map_at_offset(inode, bh, &imap, offset);
+                        if (type != IO_DELALLOC) {
-                                xfs_add_to_ioend(inode, bh, offset, type,
+                                type = IO_DELALLOC;
-                                                 &ioend, new_ioend);
+                                imap_valid = 0;
-                                count++;
                        }
                } else if (buffer_uptodate(bh)) {
-                        /*
+                        if (type != IO_OVERWRITE) {
-                         * we got here because the buffer is already mapped.
+                                type = IO_OVERWRITE;
-                         * That means it must already have extents allocated
+                                imap_valid = 0;
-                         * underneath it. Map the extent by reading it.
-                         */
-                        if (!imap_valid || flags != BMAPI_READ) {
-                                flags = BMAPI_READ;
-                                size = xfs_probe_cluster(inode, page, bh, head);
-                                err = xfs_map_blocks(inode, offset, size,
-                                                &imap, flags);
-                                if (err)
-                                        goto error;
-                                imap_valid = xfs_imap_valid(inode, &imap,
-                                                            offset);
                        }
+                } else {
+                        if (PageUptodate(page)) {
+                                ASSERT(buffer_mapped(bh));
+                                imap_valid = 0;
+                        }
+                        continue;
+                }
+                if (imap_valid)
+                        imap_valid = xfs_imap_valid(inode, &imap, offset);
+                if (!imap_valid) {
                        /*
-                         * We set the type to IO_NEW in case we are doing a
+                         * If we didn't have a valid mapping then we need to
-                         * small write at EOF that is extending the file but
+                         * put the new mapping into a separate ioend structure.
-                         * without needing an allocation. We need to update the
+                         * This ensures non-contiguous extents always have
-                         * file size on I/O completion in this case so it is
+                         * separate ioends, which is particularly important
-                         * the same case as having just allocated a new extent
+                         * for unwritten extent conversion at I/O completion
-                         * that we are writing into for the first time.
+                         * time.
                         */
-                        type = IO_NEW;
+                        new_ioend = 1;
-                        if (trylock_buffer(bh)) {
+                        err = xfs_map_blocks(inode, offset, &imap, type,
-                                if (imap_valid)
+                                             nonblocking);
-                                        all_bh = 1;
+                        if (err)
-                                xfs_add_to_ioend(inode, bh, offset, type,
+                                goto error;
-                                                &ioend, !imap_valid);
+                        imap_valid = xfs_imap_valid(inode, &imap, offset);
-                                count++;
+                }
-                        } else {
+                if (imap_valid) {
-                                imap_valid = 0;
+                        lock_buffer(bh);
-                        }
+                        if (type != IO_OVERWRITE)
-                } else if (PageUptodate(page)) {
+                                xfs_map_at_offset(inode, bh, &imap, offset);
-                        ASSERT(buffer_mapped(bh));
+                        xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-                        imap_valid = 0;
+                                         new_ioend);
+                        count++;
                }
                if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
                        end_index = last_index;
                xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                        wbc, all_bh, end_index);
+                                  wbc, end_index);
        }
        if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
        int                     create,
        int                     direct)
 {
-        int                     flags = create ? BMAPI_WRITE : BMAPI_READ;
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fileoff_t           offset_fsb, end_fsb;
+        int                     error = 0;
+        int                     lockmode = 0;
        struct xfs_bmbt_irec    imap;
+        int                     nimaps = 1;
        xfs_off_t               offset;
        ssize_t                 size;
-        int                     nimap = 1;
        int                     new = 0;
-        int                     error;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -XFS_ERROR(EIO);
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
        if (!create && direct && offset >= i_size_read(inode))
                return 0;
-        if (direct && create)
+        if (create) {
-                flags |= BMAPI_DIRECT;
+                lockmode = XFS_ILOCK_EXCL;
+                xfs_ilock(ip, lockmode);
+        } else {
+                lockmode = xfs_ilock_map_shared(ip);
+        }
+        ASSERT(offset <= mp->m_maxioffset);
+        if (offset + size > mp->m_maxioffset)
+                size = mp->m_maxioffset - offset;
+        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-        error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+        error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-                          &new);
+                          XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
        if (error)
-                return -error;
+                goto out_unlock;
-        if (nimap == 0)
-                return 0;
+        if (create &&
+            (!nimaps ||
+             (imap.br_startblock == HOLESTARTBLOCK ||
+              imap.br_startblock == DELAYSTARTBLOCK))) {
+                if (direct) {
+                        error = xfs_iomap_write_direct(ip, offset, size,
+                                                       &imap, nimaps);
+                } else {
+                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
+                }
+                if (error)
+                        goto out_unlock;
+                trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+        } else if (nimaps) {
+                trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+        } else {
+                trace_xfs_get_blocks_notfound(ip, offset, size);
+                goto out_unlock;
+        }
+        xfs_iunlock(ip, lockmode);
        if (imap.br_startblock != HOLESTARTBLOCK &&
            imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
        }
        return 0;
+out_unlock:
+        xfs_iunlock(ip, lockmode);
+        return -error;
 }
 int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
        ssize_t                 ret;
        if (rw & WRITE) {
-                iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+                iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237..71f721e1a71 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 /*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+        IO_DIRECT = 0,  /* special case for direct I/O ioends */
+        IO_DELALLOC,    /* mapping covers delalloc region */
+        IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
+        IO_OVERWRITE,   /* mapping covers already allocated extent */
+};
+/*
+ * { value, name } pairs naming the I/O types above. Value 0 (IO_DIRECT) maps
+ * to the empty string. NOTE(review): presumably consumed by the tracepoint
+ * symbolic printing helpers in xfs_trace.h - confirm against the users.
+ */
+#define XFS_IO_TYPES \
+        { 0,                    "" }, \
+        { IO_DELALLOC,          "delalloc" }, \
+        { IO_UNWRITTEN,         "unwritten" }, \
+        { IO_OVERWRITE,         "overwrite" }
+/*
 * xfs_ioend struct manages large extent writes for XFS.
 * It can manage several multi-page bio's at once.
 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e3..92f1f2acc6a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
-        .shrink = xfsbufd_wakeup,
-        .seeks = DEFAULT_SEEKS,
-};
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
 }
 /*
- *      Internal xfs_buf_t object manipulation
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
 */
+STATIC void
+xfs_buf_lru_add(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buftarg *btp = bp->b_target;
+        /* bt_lru_lock protects the list linkage and the bt_lru_nr counter */
+        spin_lock(&btp->bt_lru_lock);
+        /* an empty b_lru means the buffer is not currently on the LRU */
+        if (list_empty(&bp->b_lru)) {
+                /* this hold is the LRU's own reference on the buffer */
+                atomic_inc(&bp->b_hold);
+                list_add_tail(&bp->b_lru, &btp->bt_lru);
+                btp->bt_lru_nr++;
+        }
+        spin_unlock(&btp->bt_lru_lock);
+}
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are no
+ * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ *
+ * Note that this only unlinks the buffer and adjusts bt_lru_nr; it does not
+ * drop the b_hold reference that xfs_buf_lru_add() took.
+ */
+STATIC void
+xfs_buf_lru_del(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buftarg *btp = bp->b_target;
+        /* fast path: already off the LRU, no need to take the lock */
+        if (list_empty(&bp->b_lru))
+                return;
+        spin_lock(&btp->bt_lru_lock);
+        /* recheck under the lock before touching the list and the counter */
+        if (!list_empty(&bp->b_lru)) {
+                list_del_init(&bp->b_lru);
+                btp->bt_lru_nr--;
+        }
+        spin_unlock(&btp->bt_lru_lock);
+}
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+        struct xfs_buf  *bp)
+{
+        bp->b_flags |= XBF_STALE;
+        /* with b_lru_ref at zero xfs_buf_rele() will not put it on the LRU */
+        atomic_set(&(bp)->b_lru_ref, 0);
+        /* unlocked peek first; the state is rechecked under bt_lru_lock */
+        if (!list_empty(&bp->b_lru)) {
+                struct xfs_buftarg *btp = bp->b_target;
+                spin_lock(&btp->bt_lru_lock);
+                if (!list_empty(&bp->b_lru)) {
+                        list_del_init(&bp->b_lru);
+                        btp->bt_lru_nr--;
+                        /* drop the hold that xfs_buf_lru_add() took */
+                        atomic_dec(&bp->b_hold);
+                }
+                spin_unlock(&btp->bt_lru_lock);
+        }
+        /* at least one hold must remain after the LRU's one is dropped */
+        ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
 STATIC void
 _xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
        memset(bp, 0, sizeof(xfs_buf_t));
        atomic_set(&bp->b_hold, 1);
+        atomic_set(&bp->b_lru_ref, 1);
        init_completion(&bp->b_iowait);
+        INIT_LIST_HEAD(&bp->b_lru);
        INIT_LIST_HEAD(&bp->b_list);
        RB_CLEAR_NODE(&bp->b_rbnode);
        sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
+        ASSERT(list_empty(&bp->b_lru));
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
                                        __func__, gfp_mask);
                        XFS_STATS_INC(xb_page_retries);
-                        xfsbufd_wakeup(NULL, 0, gfp_mask);
                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
@@ -828,6 +897,7 @@ xfs_buf_rele(
        if (!pag) {
                ASSERT(!bp->b_relse);
+                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
        }
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
                        spin_unlock(&pag->pag_buf_lock);
                        bp->b_relse(bp);
+                } else if (!(bp->b_flags & XBF_STALE) &&
+                           atomic_read(&bp->b_lru_ref)) {
+                        xfs_buf_lru_add(bp);
+                        spin_unlock(&pag->pag_buf_lock);
                } else {
+                        xfs_buf_lru_del(bp);
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
                        rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
                        spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
 */
 /*
- *      Wait for any bufs with callbacks that have been submitted but
+ * Wait for any bufs with callbacks that have been submitted but have not yet
- *      have not yet returned... walk the hash list for the target.
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
 */
 void
 xfs_wait_buftarg(
        struct xfs_buftarg      *btp)
 {
-        struct xfs_perag        *pag;
+        struct xfs_buf          *bp;
-        uint                    i;
-        for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+restart:
-                pag = xfs_perag_get(btp->bt_mount, i);
+        spin_lock(&btp->bt_lru_lock);
-                spin_lock(&pag->pag_buf_lock);
+        while (!list_empty(&btp->bt_lru)) {
-                while (rb_first(&pag->pag_buf_tree)) {
+                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-                        spin_unlock(&pag->pag_buf_lock);
+                /*
+                 * The LRU itself accounts for one hold, so anything above
+                 * that means the buffer is still in use: back off and rescan
+                 * the list from the start.
+                 */
+                if (atomic_read(&bp->b_hold) > 1) {
+                        spin_unlock(&btp->bt_lru_lock);
                        delay(100);
-                        spin_lock(&pag->pag_buf_lock);
+                        goto restart;
                }
-                spin_unlock(&pag->pag_buf_lock);
+                /*
-                xfs_perag_put(pag);
+                 * clear the LRU reference count so the buffer doesn't get
+                 * ignored in xfs_buf_rele().
+                 */
+                atomic_set(&bp->b_lru_ref, 0);
+                spin_unlock(&btp->bt_lru_lock);
+                xfs_buf_rele(bp);
+                spin_lock(&btp->bt_lru_lock);
        }
+        spin_unlock(&btp->bt_lru_lock);
 }
-/*
+/*
+ * xfs_buftarg_shrink - shrinker callback for the per-buftarg buffer LRU.
+ *
+ * With nr_to_scan == 0 this only reports the current number of buffers on
+ * the LRU. Otherwise it looks at up to nr_to_scan buffers from the head of
+ * the LRU: a buffer that still has a non-zero b_lru_ref has it decremented
+ * and is rotated to the tail, a buffer whose b_lru_ref is already zero is
+ * taken off the LRU and has the LRU's hold dropped via xfs_buf_rele().
+ *
+ * Returns the number of buffers left on the LRU.
+ */
+int
- *      buftarg list for delwrite queue processing
+xfs_buftarg_shrink(
- */
+        struct shrinker         *shrink,
-static LIST_HEAD(xfs_buftarg_list);
+        int                     nr_to_scan,
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
+        gfp_t                   mask)
-STATIC void
-xfs_register_buftarg(
-        xfs_buftarg_t           *btp)
 {
-        spin_lock(&xfs_buftarg_lock);
+        struct xfs_buftarg      *btp = container_of(shrink,
-        list_add(&btp->bt_list, &xfs_buftarg_list);
+                                        struct xfs_buftarg, bt_shrinker);
-        spin_unlock(&xfs_buftarg_lock);
+        struct xfs_buf          *bp;
-}
+        LIST_HEAD(dispose);
-STATIC void
+        if (!nr_to_scan)
-xfs_unregister_buftarg(
+                return btp->bt_lru_nr;
-        xfs_buftarg_t           *btp)
-{
+        spin_lock(&btp->bt_lru_lock);
-        spin_lock(&xfs_buftarg_lock);
+        while (!list_empty(&btp->bt_lru)) {
-        list_del(&btp->bt_list);
+                if (nr_to_scan-- <= 0)
-        spin_unlock(&xfs_buftarg_lock);
+                        break;
+                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+                /*
+                 * Decrement the b_lru_ref count unless the value is already
+                 * zero. If the value is already zero, we need to reclaim the
+                 * buffer, otherwise it gets another trip through the LRU.
+                 *
+                 * atomic_add_unless() returns non-zero when it performed the
+                 * decrement, i.e. when the buffer still had references.
+                 */
+                if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+                        list_move_tail(&bp->b_lru, &btp->bt_lru);
+                        continue;
+                }
+                /*
+                 * remove the buffer from the LRU now to avoid needing another
+                 * lock round trip inside xfs_buf_rele().
+                 */
+                list_move(&bp->b_lru, &dispose);
+                btp->bt_lru_nr--;
+        }
+        spin_unlock(&btp->bt_lru_lock);
+        /* drop the LRU's hold outside bt_lru_lock; this may free the buffer */
+        while (!list_empty(&dispose)) {
+                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+                list_del_init(&bp->b_lru);
+                xfs_buf_rele(bp);
+        }
+        return btp->bt_lru_nr;
 }
 void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
        struct xfs_mount        *mp,
        struct xfs_buftarg      *btp)
 {
+        unregister_shrinker(&btp->bt_shrinker);
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
        iput(btp->bt_mapping->host);
-        /* Unregister the buftarg first so that we don't get a
-         * wakeup finding a non-existent task
-         */
-        xfs_unregister_buftarg(btp);
        kthread_stop(btp->bt_task);
        kmem_free(btp);
 }
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
        xfs_buftarg_t           *btp,
        const char              *fsname)
 {
-        int     error = 0;
-        INIT_LIST_HEAD(&btp->bt_list);
        INIT_LIST_HEAD(&btp->bt_delwrite_queue);
        spin_lock_init(&btp->bt_delwrite_lock);
        btp->bt_flags = 0;
        btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-        if (IS_ERR(btp->bt_task)) {
+        if (IS_ERR(btp->bt_task))
-                error = PTR_ERR(btp->bt_task);
+                return PTR_ERR(btp->bt_task);
-                goto out_error;
+        return 0;
-        }
-        xfs_register_buftarg(btp);
-out_error:
-        return error;
 }
 xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
+        INIT_LIST_HEAD(&btp->bt_lru);
+        spin_lock_init(&btp->bt_lru_lock);
        if (xfs_setsize_buftarg_early(btp, bdev))
                goto error;
        if (xfs_mapping_buftarg(btp, bdev))
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
+        btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+        register_shrinker(&btp->bt_shrinker);
        return btp;
 error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
        flush_workqueue(queue);
 }
-STATIC int
-xfsbufd_wakeup(
-        struct shrinker         *shrink,
-        int                     priority,
-        gfp_t                   mask)
-{
-        xfs_buftarg_t           *btp;
-        spin_lock(&xfs_buftarg_lock);
-        list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
-                if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
-                        continue;
-                if (list_empty(&btp->bt_delwrite_queue))
-                        continue;
-                set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
-                wake_up_process(btp->bt_task);
-        }
-        spin_unlock(&xfs_buftarg_lock);
-        return 0;
-}
 /*
 * Move as many buffers as specified to the supplied list
 * idicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
        if (!xfsconvertd_workqueue)
                goto out_destroy_xfsdatad_workqueue;
-        register_shrinker(&xfs_buf_shake);
        return 0;
 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-        unregister_shrinker(&xfs_buf_shake);
        destroy_workqueue(xfsconvertd_workqueue);
        destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf9..a76c2428faf 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
        /* per device delwri queue */
        struct task_struct      *bt_task;
-        struct list_head        bt_list;
        struct list_head        bt_delwrite_queue;
        spinlock_t              bt_delwrite_lock;
        unsigned long           bt_flags;
+        /* LRU control structures */
+        struct shrinker         bt_shrinker;
+        struct list_head        bt_lru;
+        spinlock_t              bt_lru_lock;
+        unsigned int            bt_lru_nr;
 } xfs_buftarg_t;
 /*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
        xfs_off_t               b_file_offset;  /* offset in file */
        size_t                  b_buffer_length;/* size of buffer in bytes */
        atomic_t                b_hold;         /* reference count */
+        atomic_t                b_lru_ref;      /* lru reclaim ref count */
        xfs_buf_flags_t         b_flags;        /* status flags */
        struct semaphore        b_sema;         /* semaphore for lockables */
+        struct list_head        b_lru;          /* lru list */
        wait_queue_head_t       b_waiters;      /* unpin waiters */
        struct list_head        b_list;
        struct xfs_perag        *b_pag;         /* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_ZEROFLAGS(bp)   ((bp)->b_flags &= \
                ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
-#define XFS_BUF_STALE(bp)       ((bp)->b_flags |= XBF_STALE)
+void xfs_buf_stale(struct xfs_buf *bp);
+/*
+ * Mark a buffer stale. Expands to a single call expression with no trailing
+ * semicolon, so "XFS_BUF_STALE(bp);" is exactly one statement and stays safe
+ * inside an unbraced if/else.
+ */
+#define XFS_BUF_STALE(bp)       xfs_buf_stale(bp)
 #define XFS_BUF_UNSTALE(bp)     ((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)     ((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp) do {                            \
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SIZE(bp)                ((bp)->b_buffer_length)
 #define XFS_BUF_SET_SIZE(bp, cnt)       ((bp)->b_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)    do { } while (0)
+/*
+ * Set the buffer's LRU reclaim reference count (b_lru_ref) to lru_ref.
+ * This is a plain atomic_set(): any previous value is overwritten.
+ */
+static inline void
+xfs_buf_set_ref(
+        struct xfs_buf  *bp,
+        int             lru_ref)
+{
+        atomic_set(&bp->b_lru_ref, lru_ref);
+}
+/* the "type" argument is accepted for source compatibility and ignored */
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)    xfs_buf_set_ref(bp, ref)
 #define XFS_BUF_SET_VTYPE(bp, type)             do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref)                do { } while (0)
 #define XFS_BUF_ISPINNED(bp)    atomic_read(&((bp)->b_pin_count))
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790e..fc0114da7fd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
        else
                fileid_type = FILEID_INO32_GEN_PARENT;
-        /* filesystem may contain 64bit inode numbers */
+        /*
-        if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
+         * If the filesystem may contain 64bit inode numbers, we need
+         * to use larger file handles that can represent them.
+         *
+         * While we only allocate inodes that fit into 32 bits, any large
+         * enough filesystem may already contain inodes that do not, thus
+         * the slightly confusing looking conditional below.
+         */
+        if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+            (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
                fileid_type |= XFS_FILEID_TYPE_64FLAG;
        /*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff7..09649499774 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
 #include <kmem.h>
 #include <mrlock.h>
-#include <sv.h>
 #include <time.h>
 #include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c115dd5e95a..a10f6416e56 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -834,8 +834,11 @@ xfsaild_wakeup(
        struct xfs_ail          *ailp,
        xfs_lsn_t               threshold_lsn)
 {
-        ailp->xa_target = threshold_lsn;
+        /* only ever move the target forwards */
-        wake_up_process(ailp->xa_task);
+        if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
+                ailp->xa_target = threshold_lsn;
+                wake_up_process(ailp->xa_task);
+        }
 }
 STATIC int
@@ -847,8 +850,17 @@ xfsaild(
        long            tout = 0; /* milliseconds */
        while (!kthread_should_stop()) {
-                schedule_timeout_interruptible(tout ?
+                /*
-                                msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+                 * for short sleeps indicating congestion, don't allow us to
+                 * get woken early. Otherwise all we do is bang on the AIL lock
+                 * without making progress.
+                 */
+                if (tout && tout <= 20)
+                        __set_current_state(TASK_KILLABLE);
+                else
+                        __set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(tout ?
+                                 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
                /* swsusp */
                try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
         */
        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+        lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                        &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
        xfs_inactive(ip);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1..a02480de975 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
        struct inode            *inode = VFS_I(ip);
+        ASSERT(rcu_read_lock_held());
+        /*
+         * check for stale RCU freed inode
+         *
+         * If the inode has been reallocated, it doesn't matter if it's not in
+         * the AG we are walking - we are walking for writeback, so if it
+         * passes all the "valid inode" checks and is dirty, then we'll write
+         * it back anyway.  If it has been reallocated and still being
+         * initialised, the XFS_INEW check below will catch it.
+         */
+        spin_lock(&ip->i_flags_lock);
+        if (!ip->i_ino)
+                goto out_unlock_noent;
+        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+                goto out_unlock_noent;
+        spin_unlock(&ip->i_flags_lock);
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
-        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-        if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-                return ENOENT;
        /* If we can't grab the inode, it must on it's way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
        /* inode is valid */
        return 0;
+out_unlock_noent:
+        spin_unlock(&ip->i_flags_lock);
+        return ENOENT;
 }
 STATIC int
@@ -98,12 +118,12 @@ restart:
                int             error = 0;
                int             i;
-                read_lock(&pag->pag_ici_lock);
+                rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                        read_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        break;
                }
@@ -118,18 +138,26 @@ restart:
                                batch[i] = NULL;
                        /*
-                         * Update the index for the next lookup. Catch overflows
+                         * Update the index for the next lookup. Catch
-                         * into the next AG range which can occur if we have inodes
+                         * overflows into the next AG range which can occur if
-                         * in the last block of the AG and we are currently
+                         * we have inodes in the last block of the AG and we
-                         * pointing to the last inode.
+                         * are currently pointing to the last inode.
+                         *
+                         * Because we may see inodes that are from the wrong AG
+                         * due to RCU freeing and reallocation, only update the
+                         * index if it lies in this AG. It was a race that led
+                         * us to see this inode, so another lookup from the
+                         * same index will not find it again.
                         */
+                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                                continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }
                /* unlock now we've grabbed the inodes. */
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
        struct xfs_perag *pag;
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
        __xfs_inode_set_reclaim_tag(pag, ip);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        xfs_perag_put(pag);
 }
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
 {
+        ASSERT(rcu_read_lock_held());
+        /* quick check for stale RCU freed inode */
+        if (!ip->i_ino)
+                return 1;
        /*
-         * do some unlocked checks first to avoid unnecceary lock traffic.
+         * do some unlocked checks first to avoid unnecessary lock traffic.
         * The first is a flush lock check, the second is a already in reclaim
         * check. Only do these checks if we are not going to block on locks.
         */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
+         *
+         * Due to RCU lookup, we may find inodes that have been freed and only
+         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+         * aren't candidates for reclaim at all, so we must check the
+         * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
         */
        spin_lock(&ip->i_flags_lock);
-        ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-        if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                /* ignore as it is already under reclaim */
+                /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
@@ -795,12 +833,12 @@ reclaim:
         * added to the tree assert that it's been there before to catch
         * problems with the inode life time early on.
         */
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
        __xfs_inode_clear_reclaim(pag, ip);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        /*
         * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;
-                        write_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
-                                write_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                break;
                        }
@@ -891,14 +929,24 @@ restart:
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
+                                 *
+                                 * Because we may see inodes that are from the
+                                 * wrong AG due to RCU freeing and
+                                 * reallocation, only update the index if it
+                                 * lies in this AG. It was a race that led us
+                                 * to see this inode, so another lookup from
+                                 * the same index will not find it again.
                                 */
+                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                                pag->pag_agno)
+                                        continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }
                        /* unlock now we've grabbed the inodes. */
-                        write_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c59..647af2a2e7a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __field(int, curr_res)
                __field(int, unit_res)
                __field(unsigned int, flags)
-                __field(void *, reserve_headq)
+                __field(int, reserveq)
-                __field(void *, write_headq)
+                __field(int, writeq)
                __field(int, grant_reserve_cycle)
                __field(int, grant_reserve_bytes)
                __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_res = tic->t_curr_res;
                __entry->unit_res = tic->t_unit_res;
                __entry->flags = tic->t_flags;
-                __entry->reserve_headq = log->l_reserve_headq;
+                __entry->reserveq = list_empty(&log->l_reserveq);
-                __entry->write_headq = log->l_write_headq;
+                __entry->writeq = list_empty(&log->l_writeq);
-                __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
+                xlog_crack_grant_head(&log->l_grant_reserve_head,
-                __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
+                                &__entry->grant_reserve_cycle,
-                __entry->grant_write_cycle = log->l_grant_write_cycle;
+                                &__entry->grant_reserve_bytes);
-                __entry->grant_write_bytes = log->l_grant_write_bytes;
+                xlog_crack_grant_head(&log->l_grant_write_head,
+                                &__entry->grant_write_cycle,
+                                &__entry->grant_write_bytes);
                __entry->curr_cycle = log->l_curr_cycle;
                __entry->curr_block = log->l_curr_block;
-                __entry->tail_lsn = log->l_tail_lsn;
+                __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
        TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-                  "t_unit_res %u t_flags %s reserve_headq 0x%p "
+                  "t_unit_res %u t_flags %s reserveq %s "
-                  "write_headq 0x%p grant_reserve_cycle %d "
+                  "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                  __entry->curr_res,
                  __entry->unit_res,
                  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-                  __entry->reserve_headq,
+                  __entry->reserveq ? "empty" : "active",
-                  __entry->write_headq,
+                  __entry->writeq ? "empty" : "active",
                  __entry->grant_reserve_cycle,
                  __entry->grant_reserve_bytes,
                  __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                 int flags, struct xfs_bmbt_irec *irec),
+                 int type, struct xfs_bmbt_irec *irec),
-        TP_ARGS(ip, offset, count, flags, irec),
+        TP_ARGS(ip, offset, count, type, irec),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
-                __field(int, flags)
+                __field(int, type)
                __field(xfs_fileoff_t, startoff)
                __field(xfs_fsblock_t, startblock)
                __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
-                __entry->flags = flags;
+                __entry->type = type;
                __entry->startoff = irec ? irec->br_startoff : 0;
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                  "offset 0x%llx count %zd flags %s "
+                  "offset 0x%llx count %zd type %s "
                  "startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                  __entry->new_size,
                  __entry->offset,
                  __entry->count,
-                  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+                  __print_symbolic(__entry->type, XFS_IO_TYPES),
                  __entry->startoff,
                  (__int64_t)__entry->startblock,
                  __entry->blockcount)
 )
 #define DEFINE_IOMAP_EVENT(name)        \
-DEFINE_EVENT(xfs_iomap_class, name,     \
+DEFINE_EVENT(xfs_imap_class, name,      \
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                 int flags, struct xfs_bmbt_irec *irec),                \
+                 int type, struct xfs_bmbt_irec *irec),         \
-        TP_ARGS(ip, offset, count, flags, irec))
+        TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
        TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
        TP_PROTO(struct xfs_alloc_arg *args), \
        TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a1..d22aa310310 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
        ASSERT(list_empty(&dqp->q_freelist));
        mutex_destroy(&dqp->q_qlock);
-        sv_destroy(&dqp->q_pinwait);
        kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
        atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d..11dd72070cb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE    (sizeof(SGI_ACL_DEFAULT)-1)
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c02..58632cc17f2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
-        rwlock_t        pag_ici_lock;   /* incore inode lock */
+        spinlock_t      pag_ici_lock;   /* incore inode cache lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
        int             pag_ici_reclaimable;    /* reclaimable inodes */
        struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca..fa8723f5870 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -577,61 +577,58 @@ xfs_alloc_ag_vextent_exact(
        xfs_extlen_t    rlen;   /* length of returned extent */
        ASSERT(args->alignment == 1);
        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-                args->agno, XFS_BTNUM_BNO);
+                                          args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
         * if any free block does.
         */
-        if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+        error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+        if (error)
                goto error0;
-        if (!i) {
+        if (!i)
-                /*
+                goto not_found;
-                 * Didn't find it, return null.
-                 */
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-                args->agbno = NULLAGBLOCK;
-                return 0;
-        }
        /*
         * Grab the freespace record.
         */
-        if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+        error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+        if (error)
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        ASSERT(fbno <= args->agbno);
        minend = args->agbno + args->minlen;
        maxend = args->agbno + args->maxlen;
        fend = fbno + flen;
        /*
         * Give up if the freespace isn't long enough for the minimum request.
         */
-        if (fend < minend) {
+        if (fend < minend)
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+                goto not_found;
-                args->agbno = NULLAGBLOCK;
-                return 0;
-        }
        /*
         * End of extent will be smaller of the freespace end and the
         * maximal requested end.
-         */
+         *
-        end = XFS_AGBLOCK_MIN(fend, maxend);
-        /*
         * Fix the length according to mod and prod if given.
         */
+        end = XFS_AGBLOCK_MIN(fend, maxend);
        args->len = end - args->agbno;
        xfs_alloc_fix_len(args);
-        if (!xfs_alloc_fix_minleft(args)) {
+        if (!xfs_alloc_fix_minleft(args))
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+                goto not_found;
-                return 0;
-        }
        rlen = args->len;
        ASSERT(args->agbno + rlen <= fend);
        end = args->agbno + rlen;
        /*
         * We are allocating agbno for rlen [agbno .. end]
         * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +637,25 @@ xfs_alloc_ag_vextent_exact(
                args->agno, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <=
                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+        error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
-                        args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+                                      args->len, XFSA_FIXUP_BNO_OK);
+        if (error) {
                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
                goto error0;
        }
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-        trace_xfs_alloc_exact_done(args);
        args->wasfromfl = 0;
+        trace_xfs_alloc_exact_done(args);
+        return 0;
+not_found:
+        /* Didn't find it, return null. */
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        args->agbno = NULLAGBLOCK;
+        trace_xfs_alloc_exact_notfound(args);
        return 0;
 error0:
@@ -659,6 +665,95 @@ error0:
 }
 /*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+        struct xfs_alloc_arg    *args,  /* allocation argument structure */
+        struct xfs_btree_cur    **gcur, /* good cursor */
+        struct xfs_btree_cur    **scur, /* searching cursor */
+        xfs_agblock_t           gdiff,  /* difference for search comparison */
+        xfs_agblock_t           *sbno,  /* extent found by search */
+        xfs_extlen_t            *slen,
+        xfs_extlen_t            *slena, /* aligned length */
+        int                     dir)    /* 0 = search right, 1 = search left */
+{
+        xfs_agblock_t           bno;
+        xfs_agblock_t           new;
+        xfs_agblock_t           sdiff;
+        int                     error;
+        int                     i;
+        /* The good extent is perfect, no need to search. */
+        if (!gdiff)
+                goto out_use_good;
+        /*
+         * Look until we find a better one, run out of space or run off the end.
+         */
+        do {
+                error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
+                                          args->minlen, &bno, slena);
+                /*
+                 * The good extent is closer than this one.
+                 */
+                if (!dir) {
+                        if (bno >= args->agbno + gdiff)
+                                goto out_use_good;
+                } else {
+                        if (bno <= args->agbno - gdiff)
+                                goto out_use_good;
+                }
+                /*
+                 * Same distance, compare length and pick the best.
+                 */
+                if (*slena >= args->minlen) {
+                        args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+                        xfs_alloc_fix_len(args);
+                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                                       args->alignment, *sbno,
+                                                       *slen, &new);
+                        /*
+                         * Choose closer size and invalidate other cursor.
+                         */
+                        if (sdiff < gdiff)
+                                goto out_use_search;
+                        goto out_use_good;
+                }
+                if (!dir)
+                        error = xfs_btree_increment(*scur, 0, &i);
+                else
+                        error = xfs_btree_decrement(*scur, 0, &i);
+                if (error)
+                        goto error0;
+        } while (i);
+out_use_good:
+        xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+        *scur = NULL;
+        return 0;
+out_use_search:
+        xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+        *gcur = NULL;
+        return 0;
+error0:
+        /* caller invalidates cursors */
+        return error;
+}
+/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1020,45 @@ xfs_alloc_ag_vextent_near(
                        }
                }
        } while (bno_cur_lt || bno_cur_gt);
        /*
         * Got both cursors still active, need to find better entry.
         */
        if (bno_cur_lt && bno_cur_gt) {
-                /*
-                 * Left side is long enough, look for a right side entry.
-                 */
                if (ltlena >= args->minlen) {
                        /*
-                         * Fix up the length.
+                         * Left side is good, look for a right side entry.
                         */
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                        rlen = args->len;
+                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                        ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
                                args->alignment, ltbno, ltlen, &ltnew);
+                        error = xfs_alloc_find_best_extent(args,
+                                                &bno_cur_lt, &bno_cur_gt,
+                                                ltdiff, &gtbno, &gtlen, &gtlena,
+                                                0 /* search right */);
+                } else {
+                        ASSERT(gtlena >= args->minlen);
                        /*
-                         * Not perfect.
+                         * Right side is good, look for a left side entry.
-                         */
-                        if (ltdiff) {
-                                /*
-                                 * Look until we find a better one, run out of
-                                 * space, or run off the end.
-                                 */
-                                while (bno_cur_lt && bno_cur_gt) {
-                                        if ((error = xfs_alloc_get_rec(
-                                                        bno_cur_gt, &gtbno,
-                                                        &gtlen, &i)))
-                                                goto error0;
-                                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                        xfs_alloc_compute_aligned(gtbno, gtlen,
-                                                args->alignment, args->minlen,
-                                                &gtbnoa, &gtlena);
-                                        /*
-                                         * The left one is clearly better.
-                                         */
-                                        if (gtbnoa >= args->agbno + ltdiff) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_gt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_gt = NULL;
-                                                break;
-                                        }
-                                        /*
-                                         * If we reach a big enough entry,
-                                         * compare the two and pick the best.
-                                         */
-                                        if (gtlena >= args->minlen) {
-                                                args->len =
-                                                        XFS_EXTLEN_MIN(gtlena,
-                                                                args->maxlen);
-                                                xfs_alloc_fix_len(args);
-                                                rlen = args->len;
-                                                gtdiff = xfs_alloc_compute_diff(
-                                                        args->agbno, rlen,
-                                                        args->alignment,
-                                                        gtbno, gtlen, &gtnew);
-                                                /*
-                                                 * Right side is better.
-                                                 */
-                                                if (gtdiff < ltdiff) {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_lt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_lt = NULL;
-                                                }
-                                                /*
-                                                 * Left side is better.
-                                                 */
-                                                else {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_gt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_gt = NULL;
-                                                }
-                                                break;
-                                        }
-                                        /*
-                                         * Fell off the right end.
-                                         */
-                                        if ((error = xfs_btree_increment(
-                                                        bno_cur_gt, 0, &i)))
-                                                goto error0;
-                                        if (!i) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_gt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_gt = NULL;
-                                                break;
-                                        }
-                                }
-                        }
-                        /*
-                         * The left side is perfect, trash the right side.
-                         */
-                        else {
-                                xfs_btree_del_cursor(bno_cur_gt,
-                                                     XFS_BTREE_NOERROR);
-                                bno_cur_gt = NULL;
-                        }
-                }
-                /*
-                 * It's the right side that was found first, look left.
-                 */
-                else {
-                        /*
-                         * Fix up the length.
                         */
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                        rlen = args->len;
+                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                        gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
                                args->alignment, gtbno, gtlen, &gtnew);
-                        /*
-                         * Right side entry isn't perfect.
+                        error = xfs_alloc_find_best_extent(args,
-                         */
+                                                &bno_cur_gt, &bno_cur_lt,
-                        if (gtdiff) {
+                                                gtdiff, &ltbno, &ltlen, &ltlena,
-                                /*
+                                                1 /* search left */);
-                                 * Look until we find a better one, run out of
-                                 * space, or run off the end.
-                                 */
-                                while (bno_cur_lt && bno_cur_gt) {
-                                        if ((error = xfs_alloc_get_rec(
-                                                        bno_cur_lt, &ltbno,
-                                                        &ltlen, &i)))
-                                                goto error0;
-                                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                        xfs_alloc_compute_aligned(ltbno, ltlen,
-                                                args->alignment, args->minlen,
-                                                &ltbnoa, &ltlena);
-                                        /*
-                                         * The right one is clearly better.
-                                         */
-                                        if (ltbnoa <= args->agbno - gtdiff) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_lt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_lt = NULL;
-                                                break;
-                                        }
-                                        /*
-                                         * If we reach a big enough entry,
-                                         * compare the two and pick the best.
-                                         */
-                                        if (ltlena >= args->minlen) {
-                                                args->len = XFS_EXTLEN_MIN(
-                                                        ltlena, args->maxlen);
-                                                xfs_alloc_fix_len(args);
-                                                rlen = args->len;
-                                                ltdiff = xfs_alloc_compute_diff(
-                                                        args->agbno, rlen,
-                                                        args->alignment,
-                                                        ltbno, ltlen, &ltnew);
-                                                /*
-                                                 * Left side is better.
-                                                 */
-                                                if (ltdiff < gtdiff) {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_gt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_gt = NULL;
-                                                }
-                                                /*
-                                                 * Right side is better.
-                                                 */
-                                                else {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_lt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_lt = NULL;
-                                                }
-                                                break;
-                                        }
-                                        /*
-                                         * Fell off the left end.
-                                         */
-                                        if ((error = xfs_btree_decrement(
-                                                        bno_cur_lt, 0, &i)))
-                                                goto error0;
-                                        if (!i) {
-                                                xfs_btree_del_cursor(bno_cur_lt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_lt = NULL;
-                                                break;
-                                        }
-                                }
-                        }
-                        /*
-                         * The right side is perfect, trash the left side.
-                         */
-                        else {
-                                xfs_btree_del_cursor(bno_cur_lt,
-                                        XFS_BTREE_NOERROR);
-                                bno_cur_lt = NULL;
-                        }
                }
+                if (error)
+                        goto error0;
        }
        /*
         * If we couldn't get anything, give up.
         */
@@ -1130,6 +1067,7 @@ xfs_alloc_ag_vextent_near(
                args->agbno = NULLAGBLOCK;
                return 0;
        }
        /*
         * At this point we have selected a freespace entry, either to the
         * left or to the right.  If it's on the right, copy all the
@@ -1146,6 +1084,7 @@ xfs_alloc_ag_vextent_near(
                j = 1;
        } else
                j = 0;
        /*
         * Fix up the length and compute the useful address.
         */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb..71e90dc2aeb 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
         * It didn't all fit, so we have to sort everything on hashval.
         */
        sbsize = sf->hdr.count * sizeof(*sbuf);
-        sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+        sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
        /*
         * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
                                args.dp = context->dp;
                                args.whichfork = XFS_ATTR_FORK;
                                args.valuelen = valuelen;
-                                args.value = kmem_alloc(valuelen, KM_SLEEP);
+                                args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
                                args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
                                args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
                                retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7..2f9e97c128a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
                return error;
        }
        ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-        if (bp != NULL) {
+        if (bp)
                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-        }
        *bpp = bp;
        return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
        switch (cur->bc_btnum) {
        case XFS_BTNUM_BNO:
        case XFS_BTNUM_CNT:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
                break;
        case XFS_BTNUM_INO:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
                break;
        case XFS_BTNUM_BMAP:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
                break;
        default:
                ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5..ed2b65f3f8b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
 #endif
 STATIC void     xfs_buf_error_relse(xfs_buf_t *bp);
-STATIC void     xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
+STATIC void     xfs_buf_do_callbacks(struct xfs_buf *bp);
 /*
 * This returns the number of log iovecs needed to log the
@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
                 * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-                        xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
+                        xfs_buf_do_callbacks(bp);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                } else {
@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
        XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 }
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. It removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
 STATIC void
 xfs_buf_do_callbacks(
-        xfs_buf_t       *bp,
+        struct xfs_buf          *bp)
-        xfs_log_item_t  *lip)
 {
-        xfs_log_item_t  *nlip;
+        struct xfs_log_item     *lip;
-        while (lip != NULL) {
+        while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
-                nlip = lip->li_bio_list;
+                XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
                ASSERT(lip->li_cb != NULL);
                /*
                 * Clear the next pointer so we don't have any
@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
                 */
                lip->li_bio_list = NULL;
                lip->li_cb(bp, lip);
-                lip = nlip;
        }
 }
@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
                        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
                        XFS_BUF_SUPER_STALE(bp);
                        trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                        xfs_buf_do_callbacks(bp, lip);
+                        xfs_buf_do_callbacks(bp);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                        xfs_buf_ioend(bp, 0);
@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
                return;
        }
-        xfs_buf_do_callbacks(bp, lip);
+        xfs_buf_do_callbacks(bp);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
        xfs_buf_ioend(bp, 0);
@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
         * We have to unpin the pinned buffers so do the
         * callbacks.
         */
-        xfs_buf_do_callbacks(bp, lip);
+        xfs_buf_do_callbacks(bp);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
        XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c..b6ecd2061e7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
        xfs_buf_log_format_t    bli_format;     /* in-log header */
 } xfs_buf_log_item_t;
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
-        xfs_daddr_t             bc_blkno;
-        uint                    bc_len;
-        int                     bc_refcount;
-        struct xfs_buf_cancel   *bc_next;
-} xfs_buf_cancel_t;
 void    xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
 void    xfs_buf_item_relse(struct xfs_buf *);
 void    xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf56..75f2ef60e57 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
 }
 /*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+        struct xfs_efi_log_item *efip)
+{
+        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+        if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+                spin_lock(&ailp->xa_lock);
+                /* xfs_trans_ail_delete() drops the AIL lock. */
+                xfs_trans_ail_delete(ailp, &efip->efi_item);
+                xfs_efi_item_free(efip);
+        }
+}
+/*
 * This returns the number of iovecs needed to log the given efi item.
 * We only need 1 iovec for an efi item.  It just logs the efi_log_format
 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
        uint                    size;
-        ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+        ASSERT(atomic_read(&efip->efi_next_extent) ==
+                                efip->efi_format.efi_nextents);
        efip->efi_format.efi_type = XFS_LI_EFI;
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
 }
 /*
- * While EFIs cannot really be pinned, the unpin operation is the
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
- * last place at which the EFI is manipulated during a transaction.
+ * which the EFI is manipulated during a transaction.  If we are being asked to
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
+ * remove the EFI it's because the transaction has been cancelled and by
- * free the EFI.
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
 */
 STATIC void
 xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
        int                     remove)
 {
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-        struct xfs_ail          *ailp = lip->li_ailp;
-        spin_lock(&ailp->xa_lock);
-        if (efip->efi_flags & XFS_EFI_CANCELED) {
-                if (remove)
-                        xfs_trans_del_item(lip);
-                /* xfs_trans_ail_delete() drops the AIL lock. */
+        if (remove) {
-                xfs_trans_ail_delete(ailp, lip);
+                ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+                xfs_trans_del_item(lip);
                xfs_efi_item_free(efip);
-        } else {
+                return;
-                efip->efi_flags |= XFS_EFI_COMMITTED;
-                spin_unlock(&ailp->xa_lock);
        }
+        __xfs_efi_release(efip);
 }
 /*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
 }
 /*
- * The EFI is logged only once and cannot be moved in the log, so
+ * The EFI is logged only once and cannot be moved in the log, so simply return
- * simply return the lsn at which it's been logged.  The canceled
+ * the lsn at which it's been logged.  For bulk transaction committed
- * flag is not paid any attention here.  Checking for that is delayed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
- * until the EFI is unpinned.
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
 */
 STATIC xfs_lsn_t
 xfs_efi_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+        set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
        return lsn;
 }
@@ -230,6 +253,7 @@ xfs_efi_init(
        xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
+        atomic_set(&efip->efi_next_extent, 0);
        return efip;
 }
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 }
 /*
- * This is called by the efd item code below to release references to
+ * This is called by the efd item code below to release references to the given
- * the given efi item.  Each efd calls this with the number of
+ * efi item.  Each efd calls this with the number of extents that it has
- * extents that it has logged, and when the sum of these reaches
+ * logged, and when the sum of these reaches the total number of extents logged
- * the total number of extents logged by this efi item we can free
+ * by this efi item we can free the efi item.
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
 */
 void
 xfs_efi_release(xfs_efi_log_item_t      *efip,
                uint                    nextents)
 {
-        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+        ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
-        int                     extents_left;
+        if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+                __xfs_efi_release(efip);
-        ASSERT(efip->efi_next_extent > 0);
-        ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-        spin_lock(&ailp->xa_lock);
-        ASSERT(efip->efi_next_extent >= nextents);
-        efip->efi_next_extent -= nextents;
-        extents_left = efip->efi_next_extent;
-        if (extents_left == 0) {
-                /* xfs_trans_ail_delete() drops the AIL lock. */
-                xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-                xfs_efi_item_free(efip);
-        } else {
-                spin_unlock(&ailp->xa_lock);
-        }
 }
 static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf6..375f68e4253 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
 #define XFS_EFI_MAX_FAST_EXTENTS        16
 /*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
 */
-#define XFS_EFI_RECOVERED       0x1
+#define XFS_EFI_RECOVERED       1
-#define XFS_EFI_COMMITTED       0x2
+#define XFS_EFI_COMMITTED       2
-#define XFS_EFI_CANCELED        0x4
 /*
 * This is the "extent free intention" log item.  It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
 */
 typedef struct xfs_efi_log_item {
        xfs_log_item_t          efi_item;
-        uint                    efi_flags;      /* misc flags */
+        atomic_t                efi_next_extent;
-        uint                    efi_next_extent;
+        unsigned long           efi_flags;      /* misc flags */
        xfs_efi_log_format_t    efi_format;
 } xfs_efi_log_item_t;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814a..f56d30e8040 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
                mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
        } else
                mp->m_maxicount = 0;
+        xfs_set_low_space_thresholds(mp);
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8..cb9b6d1469f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
 /*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation sites. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+/*
 * Allocate and initialise an xfs_inode.
 */
 STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+        ASSERT(ip->i_ino == 0);
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+        lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                        &xfs_iolock_active, "xfs_iolock_active");
        /* initialise the xfs inode */
        ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
        ip->i_size = 0;
        ip->i_new_size = 0;
-        /* prevent anyone from using this yet */
-        VFS_I(ip)->i_state = I_NEW;
        return ip;
 }
+STATIC void
+xfs_inode_free_callback(
+        struct rcu_head         *head)
+{
+        struct inode            *inode = container_of(head, struct inode, i_rcu);
+        struct xfs_inode        *ip = XFS_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_zone_free(xfs_inode_zone, ip);
+}
 void
 xfs_inode_free(
        struct xfs_inode        *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
-        kmem_zone_free(xfs_inode_zone, ip);
+        /*
+         * Because we use RCU freeing we need to ensure the inode always
+         * appears to be reclaimed with an invalid inode number when in the
+         * free state. The ip->i_flags_lock provides the barrier against lookup
+         * races.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ip->i_flags = XFS_IRECLAIM;
+        ip->i_ino = 0;
+        spin_unlock(&ip->i_flags_lock);
+        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
 /*
@@ -144,14 +177,29 @@ static int
 xfs_iget_cache_hit(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip,
+        xfs_ino_t               ino,
        int                     flags,
-        int                     lock_flags) __releases(pag->pag_ici_lock)
+        int                     lock_flags) __releases(RCU)
 {
        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;
+        /*
+         * check for re-use of an inode within an RCU grace period due to the
+         * radix tree nodes not being updated yet. We monitor for this by
+         * setting the inode number to zero before freeing the inode structure.
+         * If the inode has been reallocated and set up, then the inode number
+         * will not match, so check for that, too.
+         */
        spin_lock(&ip->i_flags_lock);
+        if (ip->i_ino != ino) {
+                trace_xfs_iget_skip(ip);
+                XFS_STATS_INC(xs_ig_frecycle);
+                error = EAGAIN;
+                goto out_error;
+        }
        /*
         * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
                ip->i_flags |= XFS_IRECLAIM;
                spin_unlock(&ip->i_flags_lock);
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                error = -inode_init_always(mp->m_super, inode);
                if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
                         * Re-initializing the inode failed, and we are in deep
                         * trouble.  Try to re-add it to the reclaim list.
                         */
-                        read_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        spin_lock(&ip->i_flags_lock);
                        ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
                        goto out_error;
                }
-                write_lock(&pag->pag_ici_lock);
+                spin_lock(&pag->pag_ici_lock);
                spin_lock(&ip->i_flags_lock);
                ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
                ip->i_flags |= XFS_INEW;
                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
                inode->i_state = I_NEW;
+                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+                mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+                lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                                &xfs_iolock_active, "xfs_iolock_active");
                spin_unlock(&ip->i_flags_lock);
-                write_unlock(&pag->pag_ici_lock);
+                spin_unlock(&pag->pag_ici_lock);
        } else {
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                trace_xfs_iget_hit(ip);
        }
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
 out_error:
        spin_unlock(&ip->i_flags_lock);
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        return error;
 }
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
                        BUG();
        }
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        /* insert the new inode */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
        *ipp = ip;
        return 0;
 out_preload_end:
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
@@ -366,7 +420,7 @@ xfs_iget(
        xfs_agino_t     agino;
        /* reject inode numbers outside existing AGs */
-        if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
        /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
 again:
        error = 0;
-        read_lock(&pag->pag_ici_lock);
+        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
        if (ip) {
-                error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                if (error)
                        goto out_error_or_again;
        } else {
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                XFS_STATS_INC(xs_ig_missed);
                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f9..be7cf625421 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
-        XFS_BUF_SET_REF(bp, XFS_INO_REF);
+        xfs_buf_set_ref(bp, XFS_INO_REF);
        /*
         * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                        read_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
-                        /* Inode not in memory or stale, nothing to do */
+                        /* Inode not in memory, nothing to do */
-                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
+                        if (!ip) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                continue;
                        }
                        /*
+                         * because this is an RCU protected lookup, we could
+                         * find a recently freed or even reallocated inode
+                         * during the lookup. We need to check under the
+                         * i_flags_lock for a valid inode here. Skip it if it
+                         * is not valid, the wrong inode or stale.
+                         */
+                        spin_lock(&ip->i_flags_lock);
+                        if (ip->i_ino != inum + i ||
+                            __xfs_iflags_test(ip, XFS_ISTALE)) {
+                                spin_unlock(&ip->i_flags_lock);
+                                rcu_read_unlock();
+                                continue;
+                        }
+                        spin_unlock(&ip->i_flags_lock);
+                        /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
                         * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                        read_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-        read_lock(&pag->pag_ici_lock);
+        rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                iq = ilist[i];
                if (iq == ip)
                        continue;
-                /* if the inode lies outside this cluster, we're done. */
-                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+                /*
-                        break;
+                 * Because this is an RCU protected lookup, we could find a
+                 * recently freed or even reallocated inode during the lookup.
+                 * Validate iq (not ip, the inode being flushed) under its own
+                 * i_flags_lock. Skip it if it is freed or outside the cluster.
+                 */
+                spin_lock(&iq->i_flags_lock);
+                if (!iq->i_ino ||
+                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                        spin_unlock(&iq->i_flags_lock);
+                        continue;
+                }
+                spin_unlock(&iq->i_flags_lock);
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
        }
 out_free:
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc..5c95fa8ec11 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
 * In-core inode flags.
 */
-#define XFS_IRECLAIM    0x0001  /* we have started reclaiming this inode    */
+#define XFS_IRECLAIM            0x0001  /* started reclaiming this inode */
-#define XFS_ISTALE      0x0002  /* inode has been staled */
+#define XFS_ISTALE              0x0002  /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_IRECLAIMABLE        0x0004  /* inode can be reclaimed */
-#define XFS_INEW        0x0008  /* inode has just been allocated */
+#define XFS_INEW                0x0008  /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010  /* inode is in a filestream directory */
+#define XFS_IFILESTREAM         0x0010  /* inode is in a filestream directory */
-#define XFS_ITRUNCATED  0x0020  /* truncated down so flush-on-close */
+#define XFS_ITRUNCATED          0x0020  /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE      0x0040  /* dirty release already seen */
 /*
 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_IOLOCK_DEP(flags)   (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
 #define XFS_ILOCK_DEP(flags)    (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+extern struct lock_class_key xfs_iolock_reclaimable;
 /*
 * Flags for xfs_itruncate_start().
 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c..fd4f398bd6f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
 * flushed to disk.  It is responsible for removing the inode item
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
 */
 void
 xfs_iflush_done(
        struct xfs_buf          *bp,
        struct xfs_log_item     *lip)
 {
-        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode_log_item *iip;
-        xfs_inode_t             *ip = iip->ili_inode;
+        struct xfs_log_item     *blip;
+        struct xfs_log_item     *next;
+        struct xfs_log_item     *prev;
        struct xfs_ail          *ailp = lip->li_ailp;
+        int                     need_ail = 0;
+        /*
+         * Scan the buffer IO completions for other inodes being completed and
+         * attach them to the current inode log item.
+         */
+        blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+        prev = NULL;
+        while (blip != NULL) {
+                if (blip->li_cb != xfs_iflush_done) {
+                        prev = blip;
+                        blip = blip->li_bio_list;
+                        continue;
+                }
+                /* blip completes via xfs_iflush_done: unlink it from bp */
+                next = blip->li_bio_list;
+                if (!prev) {
+                        XFS_BUF_SET_FSPRIVATE(bp, next);
+                } else {
+                        prev->li_bio_list = next;
+                }
+                /* add to current list */
+                blip->li_bio_list = lip->li_bio_list;
+                lip->li_bio_list = blip;
+                /*
+                 * While we have the item, do the unlocked check for needing
+                 * the AIL lock.
+                 */
+                iip = INODE_ITEM(blip);
+                if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+                        need_ail++;
+                blip = next;
+        }
+        /* make sure we capture the state of the initial inode. */
+        iip = INODE_ITEM(lip);
+        if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+                need_ail++;
        /*
         * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
         * the lock since it's cheaper, and then we recheck while
         * holding the lock before removing the inode from the AIL.
         */
-        if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
+        if (need_ail) {
+                struct xfs_log_item *log_items[need_ail];
+                int i = 0;
                spin_lock(&ailp->xa_lock);
-                if (lip->li_lsn == iip->ili_flush_lsn) {
+                for (blip = lip; blip; blip = blip->li_bio_list) {
-                        /* xfs_trans_ail_delete() drops the AIL lock. */
+                        iip = INODE_ITEM(blip);
-                        xfs_trans_ail_delete(ailp, lip);
+                        if (iip->ili_logged &&
-                } else {
+                            blip->li_lsn == iip->ili_flush_lsn) {
-                        spin_unlock(&ailp->xa_lock);
+                                log_items[i++] = blip;
+                        }
+                        ASSERT(i <= need_ail);
                }
+                /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+                xfs_trans_ail_delete_bulk(ailp, log_items, i);
        }
-        iip->ili_logged = 0;
        /*
-         * Clear the ili_last_fields bits now that we know that the
+         * clean up and unlock the flush lock now we are done. We can clear the
-         * data corresponding to them is safely on disk.
+         * ili_last_fields bits now that we know that the data corresponding to
+         * them is safely on disk.
         */
-        iip->ili_last_fields = 0;
+        for (blip = lip; blip; blip = next) {
+                next = blip->li_bio_list;
+                blip->li_bio_list = NULL;
-        /*
+                iip = INODE_ITEM(blip);
-         * Release the inode's flush lock since we're done with it.
+                iip->ili_logged = 0;
-         */
+                iip->ili_last_fields = 0;
-        xfs_ifunlock(ip);
+                xfs_ifunlock(iip->ili_inode);
+        }
 }
 /*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369..55582bd6665 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
 #define XFS_WRITEIO_ALIGN(mp,off)       (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS   2
 #define XFS_WRITE_IMAPS         XFS_BMAP_MAX_NMAP
-STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-                                  int, struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-                                 struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-                                struct xfs_bmbt_irec *, int *);
-int
-xfs_iomap(
-        struct xfs_inode        *ip,
-        xfs_off_t               offset,
-        ssize_t                 count,
-        int                     flags,
-        struct xfs_bmbt_irec    *imap,
-        int                     *nimaps,
-        int                     *new)
-{
-        struct xfs_mount        *mp = ip->i_mount;
-        xfs_fileoff_t           offset_fsb, end_fsb;
-        int                     error = 0;
-        int                     lockmode = 0;
-        int                     bmapi_flags = 0;
-        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-        *new = 0;
-        if (XFS_FORCED_SHUTDOWN(mp))
-                return XFS_ERROR(EIO);
-        trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
-        switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
-        case BMAPI_READ:
-                lockmode = xfs_ilock_map_shared(ip);
-                bmapi_flags = XFS_BMAPI_ENTIRE;
-                break;
-        case BMAPI_WRITE:
-                lockmode = XFS_ILOCK_EXCL;
-                if (flags & BMAPI_IGNSTATE)
-                        bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
-                xfs_ilock(ip, lockmode);
-                break;
-        case BMAPI_ALLOCATE:
-                lockmode = XFS_ILOCK_SHARED;
-                bmapi_flags = XFS_BMAPI_ENTIRE;
-                /* Attempt non-blocking lock */
-                if (flags & BMAPI_TRYLOCK) {
-                        if (!xfs_ilock_nowait(ip, lockmode))
-                                return XFS_ERROR(EAGAIN);
-                } else {
-                        xfs_ilock(ip, lockmode);
-                }
-                break;
-        default:
-                BUG();
-        }
-        ASSERT(offset <= mp->m_maxioffset);
-        if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-                count = mp->m_maxioffset - offset;
-        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-        error = xfs_bmapi(NULL, ip, offset_fsb,
-                        (xfs_filblks_t)(end_fsb - offset_fsb),
-                        bmapi_flags,  NULL, 0, imap,
-                        nimaps, NULL);
-        if (error)
-                goto out;
-        switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
-        case BMAPI_WRITE:
-                /* If we found an extent, return it */
-                if (*nimaps &&
-                    (imap->br_startblock != HOLESTARTBLOCK) &&
-                    (imap->br_startblock != DELAYSTARTBLOCK)) {
-                        trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                        break;
-                }
-                if (flags & BMAPI_DIRECT) {
-                        error = xfs_iomap_write_direct(ip, offset, count, flags,
-                                                       imap, nimaps);
-                } else {
-                        error = xfs_iomap_write_delay(ip, offset, count, flags,
-                                                      imap, nimaps);
-                }
-                if (!error) {
-                        trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
-                }
-                *new = 1;
-                break;
-        case BMAPI_ALLOCATE:
-                /* If we found an extent, return it */
-                xfs_iunlock(ip, lockmode);
-                lockmode = 0;
-                if (*nimaps && !isnullstartblock(imap->br_startblock)) {
-                        trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                        break;
-                }
-                error = xfs_iomap_write_allocate(ip, offset, count,
-                                                 imap, nimaps);
-                break;
-        }
-        ASSERT(*nimaps <= 1);
-out:
-        if (lockmode)
-                xfs_iunlock(ip, lockmode);
-        return XFS_ERROR(error);
-}
 STATIC int
 xfs_iomap_eof_align_last_fsb(
        xfs_mount_t     *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
        return EFSCORRUPTED;
 }
-STATIC int
+int
 xfs_iomap_write_direct(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             flags,
        xfs_bmbt_irec_t *imap,
-        int             *nmaps)
+        int             nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
                if (error)
                        goto error_out;
        } else {
-                if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+                if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                        last_fsb = MIN(last_fsb, (xfs_fileoff_t)
                                        imap->br_blockcount +
                                        imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
        xfs_trans_ijoin(tp, ip);
        bmapi_flag = XFS_BMAPI_WRITE;
-        if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
+        if (offset < ip->i_size || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
        /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
                goto error_out;
        }
-        *nmaps = 1;
        return 0;
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-        *nmaps = 0;     /* nothing set-up here */
 error_out:
        return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
 * If the caller is doing a write at the end of the file, then extend the
 * allocation out to the file system's write iosize.  We clean up any extra
 * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
 */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             ioflag,
        xfs_bmbt_irec_t *imap,
        int             nimaps,
        int             *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
        xfs_filblks_t   count_fsb;
        xfs_fsblock_t   firstblock;
        int             n, error, imaps;
+        int             found_delalloc = 0;
        *prealloc = 0;
        if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
                                return 0;
                        start_fsb += imap[n].br_blockcount;
                        count_fsb -= imap[n].br_blockcount;
+                        if (imap[n].br_startblock == DELAYSTARTBLOCK)
+                                found_delalloc = 1;
                }
        }
-        *prealloc = 1;
+        if (!found_delalloc)
+                *prealloc = 1;
        return 0;
 }
-STATIC int
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum preallocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *ip)
+{
+        xfs_fsblock_t           alloc_blocks = 0;
+        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+                int shift = 0;
+                int64_t freesp;
+                alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+                                        rounddown_pow_of_two(alloc_blocks));
+                xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+                freesp = mp->m_sb.sb_fdblocks;
+                if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                        shift = 2;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                                shift++;
+                }
+                if (shift)
+                        alloc_blocks >>= shift;
+        }
+        if (alloc_blocks < mp->m_writeio_blocks)
+                alloc_blocks = mp->m_writeio_blocks;
+        return alloc_blocks;
+}
+int
 xfs_iomap_write_delay(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             ioflag,
+        xfs_bmbt_irec_t *ret_imap)
-        xfs_bmbt_irec_t *ret_imap,
-        int             *nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-                                ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
+                                imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
                return error;
 retry:
        if (prealloc) {
+                xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-                last_fsb = ioalign + mp->m_writeio_blocks;
+                last_fsb = ioalign + alloc_blocks;
        } else {
                last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
        }
@@ -496,22 +426,31 @@ retry:
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
                          &nimaps, NULL);
-        if (error && (error != ENOSPC))
+        switch (error) {
+        case 0:
+        case ENOSPC:
+        case EDQUOT:
+                break;
+        default:
                return XFS_ERROR(error);
+        }
        /*
-         * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+         * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-         * then we must have run out of space - flush all other inodes with
+         * ENOSPC, flush all other inodes with delalloc blocks to free up
-         * delalloc blocks and retry without EOF preallocation.
+         * some of the excess reserved metadata space. For both cases, retry
+         * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (flushed)
-                        return XFS_ERROR(ENOSPC);
+                        return XFS_ERROR(error ? error : ENOSPC);
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                if (error == ENOSPC) {
-                xfs_flush_inodes(ip);
+                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                        xfs_flush_inodes(ip);
+                        xfs_ilock(ip, XFS_ILOCK_EXCL);
+                }
                flushed = 1;
                error = 0;
@@ -523,8 +462,6 @@ retry:
                return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
        *ret_imap = imap[0];
-        *nmaps = 1;
        return 0;
 }
@@ -538,13 +475,12 @@ retry:
 * We no longer bother to look at the incoming map - all we have to
 * guarantee is that whatever we allocate fills the required range.
 */
-STATIC int
+int
 xfs_iomap_write_allocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        xfs_bmbt_irec_t *imap,
+        xfs_bmbt_irec_t *imap)
-        int             *retmap)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
        int             error = 0;
        int             nres;
-        *retmap = 0;
        /*
         * Make sure that the dquots are there.
         */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
                if ((offset_fsb >= imap->br_startoff) &&
                    (offset_fsb < (imap->br_startoff +
                                   imap->br_blockcount))) {
-                        *retmap = 1;
                        XFS_STATS_INC(xs_xstrat_quick);
                        return 0;
                }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50..80615760959 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
-/* base extent manipulation calls */
-#define BMAPI_READ      (1 << 0)        /* read extents */
-#define BMAPI_WRITE     (1 << 1)        /* create extents */
-#define BMAPI_ALLOCATE  (1 << 2)        /* delayed allocate to real extents */
-/* modifiers */
-#define BMAPI_IGNSTATE  (1 << 4)        /* ignore unwritten state on read */
-#define BMAPI_DIRECT    (1 << 5)        /* direct instead of buffered write */
-#define BMAPI_MMA       (1 << 6)        /* allocate for mmap write */
-#define BMAPI_TRYLOCK   (1 << 7)        /* non-blocking request */
-#define BMAPI_FLAGS \
-        { BMAPI_READ,           "READ" }, \
-        { BMAPI_WRITE,          "WRITE" }, \
-        { BMAPI_ALLOCATE,       "ALLOCATE" }, \
-        { BMAPI_IGNSTATE,       "IGNSTATE" }, \
-        { BMAPI_DIRECT,         "DIRECT" }, \
-        { BMAPI_TRYLOCK,        "TRYLOCK" }
 struct xfs_inode;
 struct xfs_bmbt_irec;
-extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-                     struct xfs_bmbt_irec *, int *, int *);
+                        struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+                        struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+                        struct xfs_bmbt_irec *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9..0bf24b11d0c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
                                xfs_buftarg_t   *log_target,
                                xfs_daddr_t     blk_offset,
                                int             num_bblks);
-STATIC int       xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int       xlog_space_left(struct log *log, atomic64_t *head);
 STATIC int       xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void      xlog_dealloc_log(xlog_t *log);
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t	*log, xlog_in_core_t *iclog);
 /* local functions to manipulate grant head */
 STATIC int  xlog_grant_log_space(xlog_t         *log,
                                 xlog_ticket_t  *xtic);
-STATIC void xlog_grant_push_ail(xfs_mount_t     *mp,
+STATIC void xlog_grant_push_ail(struct log      *log,
                                int             need_bytes);
 STATIC void xlog_regrant_reserve_log_space(xlog_t        *log,
                                           xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 #if defined(DEBUG)
 STATIC void     xlog_verify_dest_ptr(xlog_t *log, char *ptr);
-STATIC void     xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void     xlog_verify_grant_tail(struct log *log);
 STATIC void     xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
                                  int count, boolean_t syncing);
 STATIC void     xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
                                     xfs_lsn_t tail_lsn);
 #else
 #define xlog_verify_dest_ptr(a,b)
-#define xlog_verify_grant_head(a,b)
+#define xlog_verify_grant_tail(a)
 #define xlog_verify_iclog(a,b,c,d)
 #define xlog_verify_tail_lsn(a,b,c)
 #endif
 STATIC int      xlog_iclogs_empty(xlog_t *log);
 static void
-xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
+xlog_grant_sub_space(
+        struct log      *log,
+        atomic64_t      *head,
+        int             bytes)
 {
-        if (*qp) {
+        int64_t head_val = atomic64_read(head);
-                tic->t_next         = (*qp);
+        int64_t new, old;
-                tic->t_prev         = (*qp)->t_prev;
-                (*qp)->t_prev->t_next = tic;
-                (*qp)->t_prev       = tic;
-        } else {
-                tic->t_prev = tic->t_next = tic;
-                *qp = tic;
-        }
-        tic->t_flags |= XLOG_TIC_IN_Q;
+        do {
-}
+                int     cycle, space;
-static void
+                xlog_crack_grant_head_val(head_val, &cycle, &space);
-xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
-{
-        if (tic == tic->t_next) {
-                *qp = NULL;
-        } else {
-                *qp = tic->t_next;
-                tic->t_next->t_prev = tic->t_prev;
-                tic->t_prev->t_next = tic->t_next;
-        }
-        tic->t_next = tic->t_prev = NULL;
+                space -= bytes;
-        tic->t_flags &= ~XLOG_TIC_IN_Q;
+                if (space < 0) {
+                        space += log->l_logsize;
+                        cycle--;
+                }
+                old = head_val;
+                new = xlog_assign_grant_head_val(cycle, space);
+                head_val = atomic64_cmpxchg(head, old, new);
+        } while (head_val != old);
 }
 static void
-xlog_grant_sub_space(struct log *log, int bytes)
+xlog_grant_add_space(
+        struct log      *log,
+        atomic64_t      *head,
+        int             bytes)
 {
-        log->l_grant_write_bytes -= bytes;
+        int64_t head_val = atomic64_read(head);
-        if (log->l_grant_write_bytes < 0) {
+        int64_t new, old;
-                log->l_grant_write_bytes += log->l_logsize;
-                log->l_grant_write_cycle--;
-        }
-        log->l_grant_reserve_bytes -= bytes;
-        if ((log)->l_grant_reserve_bytes < 0) {
-                log->l_grant_reserve_bytes += log->l_logsize;
-                log->l_grant_reserve_cycle--;
-        }
-}
+        do {
+                int             tmp;
+                int             cycle, space;
-static void
+                xlog_crack_grant_head_val(head_val, &cycle, &space);
-xlog_grant_add_space_write(struct log *log, int bytes)
-{
-        int tmp = log->l_logsize - log->l_grant_write_bytes;
-        if (tmp > bytes)
-                log->l_grant_write_bytes += bytes;
-        else {
-                log->l_grant_write_cycle++;
-                log->l_grant_write_bytes = bytes - tmp;
-        }
-}
-static void
+                tmp = log->l_logsize - space;
-xlog_grant_add_space_reserve(struct log *log, int bytes)
+                if (tmp > bytes)
-{
+                        space += bytes;
-        int tmp = log->l_logsize - log->l_grant_reserve_bytes;
+                else {
-        if (tmp > bytes)
+                        space = bytes - tmp;
-                log->l_grant_reserve_bytes += bytes;
+                        cycle++;
-        else {
+                }
-                log->l_grant_reserve_cycle++;
-                log->l_grant_reserve_bytes = bytes - tmp;
-        }
-}
-static inline void
+                old = head_val;
-xlog_grant_add_space(struct log *log, int bytes)
+                new = xlog_assign_grant_head_val(cycle, space);
-{
+                head_val = atomic64_cmpxchg(head, old, new);
-        xlog_grant_add_space_write(log, bytes);
+        } while (head_val != old);
-        xlog_grant_add_space_reserve(log, bytes);
 }
 static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
                trace_xfs_log_reserve(log, internal_ticket);
-                xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+                xlog_grant_push_ail(log, internal_ticket->t_unit_res);
                retval = xlog_regrant_write_log_space(log, internal_ticket);
        } else {
                /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
                trace_xfs_log_reserve(log, internal_ticket);
-                xlog_grant_push_ail(mp,
+                xlog_grant_push_ail(log,
                                    (internal_ticket->t_unit_res *
                                     internal_ticket->t_cnt));
                retval = xlog_grant_log_space(log, internal_ticket);
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
                      iclog->ic_state == XLOG_STATE_DIRTY)) {
                        if (!XLOG_FORCED_SHUTDOWN(log)) {
-                                sv_wait(&iclog->ic_force_wait, PMEM,
+                                xlog_wait(&iclog->ic_force_wait,
-                                        &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                        } else {
                                spin_unlock(&log->l_icloglock);
                        }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                        || iclog->ic_state == XLOG_STATE_DIRTY
                        || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-                                sv_wait(&iclog->ic_force_wait, PMEM,
+                                xlog_wait(&iclog->ic_force_wait,
-                                        &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                } else {
                        spin_unlock(&log->l_icloglock);
                }
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 {
        xlog_ticket_t   *tic;
        xlog_t          *log = mp->m_log;
-        int             need_bytes, free_bytes, cycle, bytes;
+        int             need_bytes, free_bytes;
        if (XLOG_FORCED_SHUTDOWN(log))
                return;
-        if (tail_lsn == 0) {
+        if (tail_lsn == 0)
-                /* needed since sync_lsn is 64 bits */
+                tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-                spin_lock(&log->l_icloglock);
-                tail_lsn = log->l_last_sync_lsn;
-                spin_unlock(&log->l_icloglock);
-        }
-        spin_lock(&log->l_grant_lock);
-        /* Also an invalid lsn.  1 implies that we aren't passing in a valid
+        /* tail_lsn == 1 implies that we weren't passed a valid value.  */
-         * tail_lsn.
+        if (tail_lsn != 1)
-         */
+                atomic64_set(&log->l_tail_lsn, tail_lsn);
-        if (tail_lsn != 1) {
-                log->l_tail_lsn = tail_lsn;
-        }
-        if ((tic = log->l_write_headq)) {
+        if (!list_empty_careful(&log->l_writeq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-                cycle = log->l_grant_write_cycle;
+                spin_lock(&log->l_grant_write_lock);
-                bytes = log->l_grant_write_bytes;
+                free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-                free_bytes = xlog_space_left(log, cycle, bytes);
+                list_for_each_entry(tic, &log->l_writeq, t_queue) {
-                do {
                        ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
                        if (free_bytes < tic->t_unit_res && tail_lsn != 1)
                                break;
                        tail_lsn = 0;
                        free_bytes -= tic->t_unit_res;
-                        sv_signal(&tic->t_wait);
+                        trace_xfs_log_regrant_write_wake_up(log, tic);
-                        tic = tic->t_next;
+                        wake_up(&tic->t_wait);
-                } while (tic != log->l_write_headq);
+                }
+                spin_unlock(&log->l_grant_write_lock);
        }
-        if ((tic = log->l_reserve_headq)) {
+        if (!list_empty_careful(&log->l_reserveq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-                cycle = log->l_grant_reserve_cycle;
+                spin_lock(&log->l_grant_reserve_lock);
-                bytes = log->l_grant_reserve_bytes;
+                free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-                free_bytes = xlog_space_left(log, cycle, bytes);
+                list_for_each_entry(tic, &log->l_reserveq, t_queue) {
-                do {
                        if (tic->t_flags & XLOG_TIC_PERM_RESERV)
                                need_bytes = tic->t_unit_res*tic->t_cnt;
                        else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t	*mp,
                                break;
                        tail_lsn = 0;
                        free_bytes -= need_bytes;
-                        sv_signal(&tic->t_wait);
+                        trace_xfs_log_grant_wake_up(log, tic);
-                        tic = tic->t_next;
+                        wake_up(&tic->t_wait);
-                } while (tic != log->l_reserve_headq);
+                }
+                spin_unlock(&log->l_grant_reserve_lock);
        }
-        spin_unlock(&log->l_grant_lock);
+}
-}       /* xfs_log_move_tail */
 /*
 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
 * We may be holding the log iclog lock upon entering this routine.
 */
 xfs_lsn_t
-xlog_assign_tail_lsn(xfs_mount_t *mp)
+xlog_assign_tail_lsn(
+        struct xfs_mount        *mp)
 {
-        xfs_lsn_t tail_lsn;
+        xfs_lsn_t               tail_lsn;
-        xlog_t    *log = mp->m_log;
+        struct log              *log = mp->m_log;
        tail_lsn = xfs_trans_ail_tail(mp->m_ail);
-        spin_lock(&log->l_grant_lock);
+        if (!tail_lsn)
-        if (tail_lsn != 0) {
+                tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-                log->l_tail_lsn = tail_lsn;
-        } else {
-                tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
-        }
-        spin_unlock(&log->l_grant_lock);
+        atomic64_set(&log->l_tail_lsn, tail_lsn);
        return tail_lsn;
-}       /* xlog_assign_tail_lsn */
+}
 /*
 * Return the space in the log between the tail and the head.  The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
 * result is that we return the size of the log as the amount of space left.
 */
 STATIC int
-xlog_space_left(xlog_t *log, int cycle, int bytes)
+xlog_space_left(
-{
+        struct log      *log,
-        int free_bytes;
+        atomic64_t      *head)
-        int tail_bytes;
+{
-        int tail_cycle;
+        int             free_bytes;
+        int             tail_bytes;
-        tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
+        int             tail_cycle;
-        tail_cycle = CYCLE_LSN(log->l_tail_lsn);
+        int             head_cycle;
-        if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
+        int             head_bytes;
-                free_bytes = log->l_logsize - (bytes - tail_bytes);
-        } else if ((tail_cycle + 1) < cycle) {
+        xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+        tail_bytes = BBTOB(tail_bytes);
+        if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+                free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+        else if (tail_cycle + 1 < head_cycle)
                return 0;
-        } else if (tail_cycle < cycle) {
+        else if (tail_cycle < head_cycle) {
-                ASSERT(tail_cycle == (cycle - 1));
+                ASSERT(tail_cycle == (head_cycle - 1));
-                free_bytes = tail_bytes - bytes;
+                free_bytes = tail_bytes - head_bytes;
        } else {
                /*
                 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
                        "xlog_space_left: head behind tail\n"
                        "  tail_cycle = %d, tail_bytes = %d\n"
                        "  GH   cycle = %d, GH   bytes = %d",
-                        tail_cycle, tail_bytes, cycle, bytes);
+                        tail_cycle, tail_bytes, head_cycle, head_bytes);
                ASSERT(0);
                free_bytes = log->l_logsize;
        }
        return free_bytes;
-}       /* xlog_space_left */
+}
 /*
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_flags       |= XLOG_ACTIVE_RECOVERY;
        log->l_prev_block  = -1;
-        log->l_tail_lsn    = xlog_assign_lsn(1, 0);
        /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
-        log->l_last_sync_lsn = log->l_tail_lsn;
+        xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+        xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
        log->l_curr_cycle  = 1;     /* 0 is bad since this is initial value */
-        log->l_grant_reserve_cycle = 1;
+        xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
-        log->l_grant_write_cycle = 1;
+        xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
+        INIT_LIST_HEAD(&log->l_reserveq);
+        INIT_LIST_HEAD(&log->l_writeq);
+        spin_lock_init(&log->l_grant_reserve_lock);
+        spin_lock_init(&log->l_grant_write_lock);
        error = EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_xbuf = bp;
        spin_lock_init(&log->l_icloglock);
-        spin_lock_init(&log->l_grant_lock);
+        init_waitqueue_head(&log->l_flush_wait);
-        sv_init(&log->l_flush_wait, 0, "flush_wait");
        /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
        ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
                ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
                ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
-                sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
+                init_waitqueue_head(&iclog->ic_force_wait);
-                sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+                init_waitqueue_head(&iclog->ic_write_wait);
                iclogp = &iclog->ic_next;
        }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t	*mp,
 out_free_iclog:
        for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
                prev_iclog = iclog->ic_next;
-                if (iclog->ic_bp) {
+                if (iclog->ic_bp)
-                        sv_destroy(&iclog->ic_force_wait);
-                        sv_destroy(&iclog->ic_write_wait);
                        xfs_buf_free(iclog->ic_bp);
-                }
                kmem_free(iclog);
        }
        spinlock_destroy(&log->l_icloglock);
-        spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
 out_free_log:
        kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
 * water mark.  In this manner, we would be creating a low water mark.
 */
 STATIC void
-xlog_grant_push_ail(xfs_mount_t *mp,
+xlog_grant_push_ail(
-                    int         need_bytes)
+        struct log      *log,
+        int             need_bytes)
 {
-    xlog_t      *log = mp->m_log;       /* pointer to the log */
+        xfs_lsn_t       threshold_lsn = 0;
-    xfs_lsn_t   tail_lsn;               /* lsn of the log tail */
+        xfs_lsn_t       last_sync_lsn;
-    xfs_lsn_t   threshold_lsn = 0;      /* lsn we'd like to be at */
+        int             free_blocks;
-    int         free_blocks;            /* free blocks left to write to */
+        int             free_bytes;
-    int         free_bytes;             /* free bytes left to write to */
+        int             threshold_block;
-    int         threshold_block;        /* block in lsn we'd like to be at */
+        int             threshold_cycle;
-    int         threshold_cycle;        /* lsn cycle we'd like to be at */
+        int             free_threshold;
-    int         free_threshold;
+        ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
-    ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+        free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-    spin_lock(&log->l_grant_lock);
+        free_blocks = BTOBBT(free_bytes);
-    free_bytes = xlog_space_left(log,
-                                 log->l_grant_reserve_cycle,
+        /*
-                                 log->l_grant_reserve_bytes);
+         * Set the threshold for the minimum number of free blocks in the
-    tail_lsn = log->l_tail_lsn;
+         * log to the maximum of what the caller needs, one quarter of the
-    free_blocks = BTOBBT(free_bytes);
+         * log, and 256 blocks.
+         */
-    /*
+        free_threshold = BTOBB(need_bytes);
-     * Set the threshold for the minimum number of free blocks in the
+        free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
-     * log to the maximum of what the caller needs, one quarter of the
+        free_threshold = MAX(free_threshold, 256);
-     * log, and 256 blocks.
+        if (free_blocks >= free_threshold)
-     */
+                return;
-    free_threshold = BTOBB(need_bytes);
-    free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
-    free_threshold = MAX(free_threshold, 256);
+                                                &threshold_block);
-    if (free_blocks < free_threshold) {
+        threshold_block += free_threshold;
-        threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
-        threshold_cycle = CYCLE_LSN(tail_lsn);
        if (threshold_block >= log->l_logBBsize) {
-            threshold_block -= log->l_logBBsize;
+                threshold_block -= log->l_logBBsize;
-            threshold_cycle += 1;
+                threshold_cycle += 1;
        }
-        threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block);
+        threshold_lsn = xlog_assign_lsn(threshold_cycle,
+                                        threshold_block);
+        /*
+         * Don't pass in an lsn greater than the lsn of the last
+         * log record known to be on disk. Use a snapshot of the last sync lsn
+         * so that it doesn't change between the compare and the set.
+         */
+        last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+        if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+                threshold_lsn = last_sync_lsn;
-        /* Don't pass in an lsn greater than the lsn of the last
+        /*
-         * log record known to be on disk.
+         * Get the transaction layer to kick the dirty buffers out to
+         * disk asynchronously. No point in trying to do this if
+         * the filesystem is shutting down.
         */
-        if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
+        if (!XLOG_FORCED_SHUTDOWN(log))
-            threshold_lsn = log->l_last_sync_lsn;
+                xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-    }
+}
-    spin_unlock(&log->l_grant_lock);
-    /*
-     * Get the transaction layer to kick the dirty buffers out to
-     * disk asynchronously. No point in trying to do this if
-     * the filesystem is shutting down.
-     */
-    if (threshold_lsn &&
-        !XLOG_FORCED_SHUTDOWN(log))
-            xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-}       /* xlog_grant_push_ail */
 /*
 * The bdstrat callback function for log bufs. This gives us a central
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t		*log,
                 roundoff < BBTOB(1)));
        /* move grant heads by roundoff in sync */
-        spin_lock(&log->l_grant_lock);
+        xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
-        xlog_grant_add_space(log, roundoff);
+        xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
-        spin_unlock(&log->l_grant_lock);
        /* put cycle number in every block */
        xlog_pack_data(log, iclog, roundoff); 
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
-                sv_destroy(&iclog->ic_force_wait);
-                sv_destroy(&iclog->ic_write_wait);
                xfs_buf_free(iclog->ic_bp);
                next_iclog = iclog->ic_next;
                kmem_free(iclog);
                iclog = next_iclog;
        }
        spinlock_destroy(&log->l_icloglock);
-        spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
        log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
                                lowest_lsn = xlog_get_lowest_lsn(log);
                                if (lowest_lsn &&
                                    XFS_LSN_CMP(lowest_lsn,
-                                                be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+                                                be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
                                        iclog = iclog->ic_next;
                                        continue; /* Leave this iclog for
                                                   * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
                                iclog->ic_state = XLOG_STATE_CALLBACK;
-                                spin_unlock(&log->l_icloglock);
-                                /* l_last_sync_lsn field protected by
+                                /*
-                                 * l_grant_lock. Don't worry about iclog's lsn.
+                                 * update the last_sync_lsn before we drop the
-                                 * No one else can be here except us.
+                                 * icloglock to ensure we are the only one that
+                                 * can update it.
                                 */
-                                spin_lock(&log->l_grant_lock);
+                                ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
-                                ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn,
+                                        be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-                                       be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+                                atomic64_set(&log->l_last_sync_lsn,
-                                log->l_last_sync_lsn =
+                                        be64_to_cpu(iclog->ic_header.h_lsn));
-                                        be64_to_cpu(iclog->ic_header.h_lsn);
-                                spin_unlock(&log->l_grant_lock);
-                        } else {
+                        } else
-                                spin_unlock(&log->l_icloglock);
                                ioerrors++;
-                        }
+                        spin_unlock(&log->l_icloglock);
                        /*
                         * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
                        xlog_state_clean_log(log);
                        /* wake up threads waiting in xfs_log_force() */
-                        sv_broadcast(&iclog->ic_force_wait);
+                        wake_up_all(&iclog->ic_force_wait);
                        iclog = iclog->ic_next;
                } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
        spin_unlock(&log->l_icloglock);
        if (wake)
-                sv_broadcast(&log->l_flush_wait);
+                wake_up_all(&log->l_flush_wait);
 }
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
         * iclog buffer, we wake them all, one will get to do the
         * I/O, the others get to wait for the result.
         */
-        sv_broadcast(&iclog->ic_write_wait);
+        wake_up_all(&iclog->ic_write_wait);
        spin_unlock(&log->l_icloglock);
        xlog_state_do_callback(log, aborted, iclog);    /* also cleans log */
 }       /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
                XFS_STATS_INC(xs_log_noiclogs);
                /* Wait for log writes to have flushed */
-                sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
+                xlog_wait(&log->l_flush_wait, &log->l_icloglock);
                goto restart;
        }
@@ -2527,6 +2486,18 @@ restart:
 *
 * Once a ticket gets put onto the reserveq, it will only return after
 * the needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off the reserveq under the
+ * l_grant_reserve_lock, we only need to take that lock if we are going
+ * to add the ticket to the queue and sleep. We can avoid taking the lock if the
+ * ticket was never added to the reserveq because the t_queue list head will be
+ * empty and we hold the only reference to it so it can safely be checked
+ * unlocked.
 */
 STATIC int
 xlog_grant_log_space(xlog_t        *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t	   *log,
 {
        int              free_bytes;
        int              need_bytes;
-#ifdef DEBUG
-        xfs_lsn_t        tail_lsn;
-#endif
 #ifdef DEBUG
        if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                panic("grant Recovery problem");
 #endif
-        /* Is there space or do we need to sleep? */
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_grant_enter(log, tic);
+        need_bytes = tic->t_unit_res;
+        if (tic->t_flags & XFS_LOG_PERM_RESERV)
+                need_bytes *= tic->t_ocnt;
        /* something is already sleeping; insert new transaction at end */
-        if (log->l_reserve_headq) {
+        if (!list_empty_careful(&log->l_reserveq)) {
-                xlog_ins_ticketq(&log->l_reserve_headq, tic);
+                spin_lock(&log->l_grant_reserve_lock);
+                /* recheck the queue now we are locked */
+                if (list_empty(&log->l_reserveq)) {
+                        spin_unlock(&log->l_grant_reserve_lock);
+                        goto redo;
+                }
+                list_add_tail(&tic->t_queue, &log->l_reserveq);
                trace_xfs_log_grant_sleep1(log, tic);
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t	   *log,
                        goto error_return;
                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+                xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
                /*
                 * If we got an error, and the filesystem is shutting down,
                 * we'll catch it down below. So just continue...
                 */
                trace_xfs_log_grant_wake1(log, tic);
-                spin_lock(&log->l_grant_lock);
        }
-        if (tic->t_flags & XFS_LOG_PERM_RESERV)
-                need_bytes = tic->t_unit_res*tic->t_ocnt;
-        else
-                need_bytes = tic->t_unit_res;
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
-        free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
+        free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-                                     log->l_grant_reserve_bytes);
        if (free_bytes < need_bytes) {
-                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+                spin_lock(&log->l_grant_reserve_lock);
-                        xlog_ins_ticketq(&log->l_reserve_headq, tic);
+                if (list_empty(&tic->t_queue))
+                        list_add_tail(&tic->t_queue, &log->l_reserveq);
                trace_xfs_log_grant_sleep2(log, tic);
-                spin_unlock(&log->l_grant_lock);
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
-                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-                spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
-                trace_xfs_log_grant_wake2(log, tic);
+                xlog_grant_push_ail(log, need_bytes);
+                XFS_STATS_INC(xs_sleep_logspace);
+                xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+                trace_xfs_log_grant_wake2(log, tic);
                goto redo;
-        } else if (tic->t_flags & XLOG_TIC_IN_Q)
+        }
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
-        /* we've got enough space */
+        if (!list_empty(&tic->t_queue)) {
-        xlog_grant_add_space(log, need_bytes);
+                spin_lock(&log->l_grant_reserve_lock);
-#ifdef DEBUG
+                list_del_init(&tic->t_queue);
-        tail_lsn = log->l_tail_lsn;
+                spin_unlock(&log->l_grant_reserve_lock);
-        /*
-         * Check to make sure the grant write head didn't just over lap the
-         * tail.  If the cycles are the same, we can't be overlapping.
-         * Otherwise, make sure that the cycles differ by exactly one and
-         * check the byte count.
-         */
-        if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-                ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-                ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
        }
-#endif
+        /* we've got enough space */
+        xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
+        xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_grant_exit(log, tic);
-        xlog_verify_grant_head(log, 1);
+        xlog_verify_grant_tail(log);
-        spin_unlock(&log->l_grant_lock);
        return 0;
- error_return:
+error_return_unlocked:
-        if (tic->t_flags & XLOG_TIC_IN_Q)
+        spin_lock(&log->l_grant_reserve_lock);
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
+error_return:
+        list_del_init(&tic->t_queue);
+        spin_unlock(&log->l_grant_reserve_lock);
        trace_xfs_log_grant_error(log, tic);
        /*
@@ -2638,7 +2597,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-        spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }       /* xlog_grant_log_space */
@@ -2646,17 +2604,14 @@ redo:
 /*
 * Replenish the byte reservation required by moving the grant write head.
 *
- *
+ * Similar to xlog_grant_log_space, the function is structured to have a lock
+ * free fast path.
 */
 STATIC int
 xlog_regrant_write_log_space(xlog_t        *log,
                             xlog_ticket_t *tic)
 {
        int             free_bytes, need_bytes;
-        xlog_ticket_t   *ntic;
-#ifdef DEBUG
-        xfs_lsn_t       tail_lsn;
-#endif
        tic->t_curr_res = tic->t_unit_res;
        xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t	   *log,
                panic("regrant Recovery problem");
 #endif
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_regrant_write_enter(log, tic);
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
        /* If there are other waiters on the queue then give them a
         * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t	   *log,
         * this transaction.
         */
        need_bytes = tic->t_unit_res;
-        if ((ntic = log->l_write_headq)) {
+        if (!list_empty_careful(&log->l_writeq)) {
-                free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+                struct xlog_ticket *ntic;
-                                             log->l_grant_write_bytes);
-                do {
+                spin_lock(&log->l_grant_write_lock);
+                free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+                list_for_each_entry(ntic, &log->l_writeq, t_queue) {
                        ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
                        if (free_bytes < ntic->t_unit_res)
                                break;
                        free_bytes -= ntic->t_unit_res;
-                        sv_signal(&ntic->t_wait);
+                        wake_up(&ntic->t_wait);
-                        ntic = ntic->t_next;
+                }
-                } while (ntic != log->l_write_headq);
-                if (ntic != log->l_write_headq) {
-                        if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
-                                xlog_ins_ticketq(&log->l_write_headq, tic);
+                if (ntic != list_first_entry(&log->l_writeq,
+                                                struct xlog_ticket, t_queue)) {
+                        if (list_empty(&tic->t_queue))
+                                list_add_tail(&tic->t_queue, &log->l_writeq);
                        trace_xfs_log_regrant_write_sleep1(log, tic);
-                        spin_unlock(&log->l_grant_lock);
+                        xlog_grant_push_ail(log, need_bytes);
-                        xlog_grant_push_ail(log->l_mp, need_bytes);
-                        spin_lock(&log->l_grant_lock);
                        XFS_STATS_INC(xs_sleep_logspace);
-                        sv_wait(&tic->t_wait, PINOD|PLTWAIT,
+                        xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-                                &log->l_grant_lock, s);
-                        /* If we're shutting down, this tic is already
-                         * off the queue */
-                        spin_lock(&log->l_grant_lock);
-                        if (XLOG_FORCED_SHUTDOWN(log))
-                                goto error_return;
                        trace_xfs_log_regrant_write_wake1(log, tic);
-                }
+                } else
+                        spin_unlock(&log->l_grant_write_lock);
        }
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
-        free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+        free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-                                     log->l_grant_write_bytes);
        if (free_bytes < need_bytes) {
-                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+                spin_lock(&log->l_grant_write_lock);
-                        xlog_ins_ticketq(&log->l_write_headq, tic);
+                if (list_empty(&tic->t_queue))
-                spin_unlock(&log->l_grant_lock);
+                        list_add_tail(&tic->t_queue, &log->l_writeq);
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
-                XFS_STATS_INC(xs_sleep_logspace);
-                trace_xfs_log_regrant_write_sleep2(log, tic);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-                /* If we're shutting down, this tic is already off the queue */
-                spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
+                xlog_grant_push_ail(log, need_bytes);
+                XFS_STATS_INC(xs_sleep_logspace);
+                trace_xfs_log_regrant_write_sleep2(log, tic);
+                xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
                trace_xfs_log_regrant_write_wake2(log, tic);
                goto redo;
-        } else if (tic->t_flags & XLOG_TIC_IN_Q)
+        }
-                xlog_del_ticketq(&log->l_write_headq, tic);
-        /* we've got enough space */
+        if (!list_empty(&tic->t_queue)) {
-        xlog_grant_add_space_write(log, need_bytes);
+                spin_lock(&log->l_grant_write_lock);
-#ifdef DEBUG
+                list_del_init(&tic->t_queue);
-        tail_lsn = log->l_tail_lsn;
+                spin_unlock(&log->l_grant_write_lock);
-        if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-                ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-                ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
        }
-#endif
+        /* we've got enough space */
+        xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_regrant_write_exit(log, tic);
+        xlog_verify_grant_tail(log);
-        xlog_verify_grant_head(log, 1);
-        spin_unlock(&log->l_grant_lock);
        return 0;
+ error_return_unlocked:
+        spin_lock(&log->l_grant_write_lock);
 error_return:
-        if (tic->t_flags & XLOG_TIC_IN_Q)
+        list_del_init(&tic->t_queue);
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
+        spin_unlock(&log->l_grant_write_lock);
        trace_xfs_log_regrant_write_error(log, tic);
        /*
@@ -2778,7 +2714,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-        spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }       /* xlog_regrant_write_log_space */
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
-        spin_lock(&log->l_grant_lock);
+        xlog_grant_sub_space(log, &log->l_grant_reserve_head,
-        xlog_grant_sub_space(log, ticket->t_curr_res);
+                                        ticket->t_curr_res);
+        xlog_grant_sub_space(log, &log->l_grant_write_head,
+                                        ticket->t_curr_res);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
        trace_xfs_log_regrant_reserve_sub(log, ticket);
-        xlog_verify_grant_head(log, 1);
        /* just return if we still have some of the pre-reserved space */
-        if (ticket->t_cnt > 0) {
+        if (ticket->t_cnt > 0)
-                spin_unlock(&log->l_grant_lock);
                return;
-        }
-        xlog_grant_add_space_reserve(log, ticket->t_unit_res);
+        xlog_grant_add_space(log, &log->l_grant_reserve_head,
+                                        ticket->t_unit_res);
        trace_xfs_log_regrant_reserve_exit(log, ticket);
-        xlog_verify_grant_head(log, 0);
-        spin_unlock(&log->l_grant_lock);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
 }       /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
 xlog_ungrant_log_space(xlog_t        *log,
                       xlog_ticket_t *ticket)
 {
+        int     bytes;
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_ungrant_enter(log, ticket);
-        xlog_grant_sub_space(log, ticket->t_curr_res);
        trace_xfs_log_ungrant_sub(log, ticket);
-        /* If this is a permanent reservation ticket, we may be able to free
+        /*
+         * If this is a permanent reservation ticket, we may be able to free
         * up more space based on the remaining count.
         */
+        bytes = ticket->t_curr_res;
        if (ticket->t_cnt > 0) {
                ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
-                xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
+                bytes += ticket->t_unit_res*ticket->t_cnt;
        }
+        xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
+        xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
        trace_xfs_log_ungrant_exit(log, ticket);
-        xlog_verify_grant_head(log, 1);
-        spin_unlock(&log->l_grant_lock);
        xfs_log_move_tail(log->l_mp, 1);
 }       /* xlog_ungrant_log_space */
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
        if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
                /* update tail before writing to iclog */
-                xlog_assign_tail_lsn(log->l_mp);
+                xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
                sync++;
                iclog->ic_state = XLOG_STATE_SYNCING;
-                iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
+                iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
-                xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+                xlog_verify_tail_lsn(log, iclog, tail_lsn);
                /* cycle incremented when incrementing curr_block */
        }
        spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
                        return XFS_ERROR(EIO);
                }
                XFS_STATS_INC(xs_log_force_sleep);
-                sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
+                xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                /*
                 * No need to grab the log lock here since we're
                 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
                                XFS_STATS_INC(xs_log_force_sleep);
-                                sv_wait(&iclog->ic_prev->ic_write_wait,
+                                xlog_wait(&iclog->ic_prev->ic_write_wait,
-                                        PSWP, &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                                if (log_flushed)
                                        *log_flushed = 1;
                                already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
                                return XFS_ERROR(EIO);
                        }
                        XFS_STATS_INC(xs_log_force_sleep);
-                        sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+                        xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                        /*
                         * No need to grab the log lock here since we're
                         * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
        xlog_ticket_t   *ticket)
 {
        ASSERT(atomic_read(&ticket->t_ref) > 0);
-        if (atomic_dec_and_test(&ticket->t_ref)) {
+        if (atomic_dec_and_test(&ticket->t_ref))
-                sv_destroy(&ticket->t_wait);
                kmem_zone_free(xfs_log_ticket_zone, ticket);
-        }
 }
 xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
        }
        atomic_set(&tic->t_ref, 1);
+        INIT_LIST_HEAD(&tic->t_queue);
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
        tic->t_cnt              = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
        tic->t_trans_type       = 0;
        if (xflags & XFS_LOG_PERM_RESERV)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
-        sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
+        init_waitqueue_head(&tic->t_wait);
        xlog_tic_reset_res(tic);
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
 }
 STATIC void
-xlog_verify_grant_head(xlog_t *log, int equals)
+xlog_verify_grant_tail(
+        struct log      *log)
 {
-    if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
+        int             tail_cycle, tail_blocks;
-        if (equals)
+        int             cycle, space;
-            ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
-        else
+        /*
-            ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
+         * Check to make sure the grant write head didn't just overlap the
-    } else {
+         * tail.  If the cycles are the same, we can't be overlapping.
-        ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
+         * Otherwise, make sure that the cycles differ by exactly one and
-        ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
+         * check the byte count.
-    }
+         */
-}       /* xlog_verify_grant_head */
+        xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+        if (tail_cycle != cycle) {
+                ASSERT(cycle - 1 == tail_cycle);
+                ASSERT(space <= BBTOB(tail_blocks));
+        }
+}
 /* check if it will fit */
 STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
                xlog_cil_force(log);
        /*
-         * We must hold both the GRANT lock and the LOG lock,
+         * mark the filesystem and the log as in a shutdown state and wake
-         * before we mark the filesystem SHUTDOWN and wake
+         * everybody up to tell them the bad news.
-         * everybody up to tell the bad news.
         */
        spin_lock(&log->l_icloglock);
-        spin_lock(&log->l_grant_lock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
        if (mp->m_sb_bp)
                XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
        spin_unlock(&log->l_icloglock);
        /*
-         * We don't want anybody waiting for log reservations
+         * We don't want anybody waiting for log reservations after this. That
-         * after this. That means we have to wake up everybody
+         * means we have to wake up everybody queued up on reserveq as well as
-         * queued up on reserve_headq as well as write_headq.
+         * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
-         * In addition, we make sure in xlog_{re}grant_log_space
+         * we don't enqueue anything once the SHUTDOWN flag is set, and this
-         * that we don't enqueue anything once the SHUTDOWN flag
+         * action is protected by the grant locks.
-         * is set, and this action is protected by the GRANTLOCK.
         */
-        if ((tic = log->l_reserve_headq)) {
+        spin_lock(&log->l_grant_reserve_lock);
-                do {
+        list_for_each_entry(tic, &log->l_reserveq, t_queue)
-                        sv_signal(&tic->t_wait);
+                wake_up(&tic->t_wait);
-                        tic = tic->t_next;
+        spin_unlock(&log->l_grant_reserve_lock);
-                } while (tic != log->l_reserve_headq);
-        }
+        spin_lock(&log->l_grant_write_lock);
+        list_for_each_entry(tic, &log->l_writeq, t_queue)
-        if ((tic = log->l_write_headq)) {
+                wake_up(&tic->t_wait);
-                do {
+        spin_unlock(&log->l_grant_write_lock);
-                        sv_signal(&tic->t_wait);
-                        tic = tic->t_next;
-                } while (tic != log->l_write_headq);
-        }
-        spin_unlock(&log->l_grant_lock);
        if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
                ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97..9dc8125d04e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
        init_rwsem(&cil->xc_ctx_lock);
-        sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+        init_waitqueue_head(&cil->xc_commit_wait);
        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
        int     abort)
 {
        struct xfs_cil_ctx      *ctx = args;
-        struct xfs_log_vec      *lv;
-        int                     abortflag = abort ? XFS_LI_ABORTED : 0;
        struct xfs_busy_extent  *busyp, *n;
-        /* unpin all the log items */
+        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
-        for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+                                        ctx->start_lsn, abort);
-                xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
-                                                        abortflag);
-        }
        list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
                xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
        }
@@ -592,7 +587,7 @@ restart:
         */
        spin_lock(&cil->xc_cil_lock);
        ctx->commit_lsn = commit_lsn;
-        sv_broadcast(&cil->xc_commit_wait);
+        wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_cil_lock);
        /* release the hounds! */
@@ -757,7 +752,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617..d5f8be8f4bf 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
 struct xfs_buf;
 struct log;
 struct xlog_ticket;
-struct xfs_buf_cancel;
 struct xfs_mount;
 /*
@@ -54,7 +53,6 @@ struct xfs_mount;
        BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
 static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
 {
        return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
 */
 #define XLOG_TIC_INITED         0x1     /* has been initialized */
 #define XLOG_TIC_PERM_RESERV    0x2     /* permanent reservation */
-#define XLOG_TIC_IN_Q           0x4
 #define XLOG_TIC_FLAGS \
        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
-        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
+        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
-        { XLOG_TIC_IN_Q,        "XLOG_TIC_IN_Q" }
 #endif  /* __KERNEL__ */
@@ -244,9 +240,8 @@ typedef struct xlog_res {
 } xlog_res_t;
 typedef struct xlog_ticket {
-        sv_t               t_wait;       /* ticket wait queue            : 20 */
+        wait_queue_head_t  t_wait;       /* ticket wait queue */
-        struct xlog_ticket *t_next;      /*                              :4|8 */
+        struct list_head   t_queue;      /* reserve/write queue */
-        struct xlog_ticket *t_prev;      /*                              :4|8 */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
        atomic_t           t_ref;        /* ticket reference count       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
 * and move everything else out to subsequent cachelines.
 */
 typedef struct xlog_in_core {
-        sv_t                    ic_force_wait;
+        wait_queue_head_t       ic_force_wait;
-        sv_t                    ic_write_wait;
+        wait_queue_head_t       ic_write_wait;
        struct xlog_in_core     *ic_next;
        struct xlog_in_core     *ic_prev;
        struct xfs_buf          *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
        struct xfs_cil_ctx      *xc_ctx;
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
-        sv_t                    xc_commit_wait;
+        wait_queue_head_t       xc_commit_wait;
        xfs_lsn_t               xc_current_sequence;
 };
@@ -491,7 +486,7 @@ typedef struct log {
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
        uint                    l_flags;
        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-        struct xfs_buf_cancel   **l_buf_cancel_table;
+        struct list_head        *l_buf_cancel_table;
        int                     l_iclog_hsize;  /* size of iclog header */
        int                     l_iclog_heads;  /* # of iclog header sectors */
        uint                    l_sectBBsize;   /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
        int                     l_logBBsize;    /* size of log in BB chunks */
        /* The following block of fields are changed while holding icloglock */
-        sv_t                    l_flush_wait ____cacheline_aligned_in_smp;
+        wait_queue_head_t       l_flush_wait ____cacheline_aligned_in_smp;
                                                /* waiting for iclog flush */
        int                     l_covered_state;/* state of "covering disk
                                                 * log entries" */
        xlog_in_core_t          *l_iclog;       /* head log queue       */
        spinlock_t              l_icloglock;    /* grab to change iclog state */
-        xfs_lsn_t               l_tail_lsn;     /* lsn of 1st LR with unflushed
-                                                 * buffers */
-        xfs_lsn_t               l_last_sync_lsn;/* lsn of last LR on disk */
        int                     l_curr_cycle;   /* Cycle number of log writes */
        int                     l_prev_cycle;   /* Cycle number before last
                                                 * block increment */
        int                     l_curr_block;   /* current logical log block */
        int                     l_prev_block;   /* previous logical log block */
-        /* The following block of fields are changed while holding grant_lock */
+        /*
-        spinlock_t              l_grant_lock ____cacheline_aligned_in_smp;
+         * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
-        xlog_ticket_t           *l_reserve_headq;
+         * read without needing to hold specific locks. To avoid operations
-        xlog_ticket_t           *l_write_headq;
+         * contending with other hot objects, place each of them on a separate
-        int                     l_grant_reserve_cycle;
+         * cacheline.
-        int                     l_grant_reserve_bytes;
+         */
-        int                     l_grant_write_cycle;
+        /* lsn of last LR on disk */
-        int                     l_grant_write_bytes;
+        atomic64_t              l_last_sync_lsn ____cacheline_aligned_in_smp;
+        /* lsn of 1st LR with unflushed buffers */
+        atomic64_t              l_tail_lsn ____cacheline_aligned_in_smp;
+        /*
+         * ticket grant locks, queues and accounting have their own cachelines
+         * as these are quite hot and can be operated on concurrently.
+         */
+        spinlock_t              l_grant_reserve_lock ____cacheline_aligned_in_smp;
+        struct list_head        l_reserveq;
+        atomic64_t              l_grant_reserve_head;
+        spinlock_t              l_grant_write_lock ____cacheline_aligned_in_smp;
+        struct list_head        l_writeq;
+        atomic64_t              l_grant_write_head;
        /* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
 } xlog_t;
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+        ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
 #define XLOG_FORCED_SHUTDOWN(log)       ((log)->l_flags & XLOG_IO_ERROR)
 /* common routines */
@@ -562,6 +571,61 @@ int	xlog_write(struct log *log, struct xfs_log_vec *log_vector,
                                xlog_in_core_t **commit_iclog, uint flags);
 /*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+        xfs_lsn_t val = atomic64_read(lsn);
+        *cycle = CYCLE_LSN(val);
+        *block = BLOCK_LSN(val);
+}
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+        atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+        *cycle = val >> 32;
+        *space = val & 0xffffffff;
+}
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+        xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+        return ((int64_t)cycle << 32) | space;
+}
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+        atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+/*
 * Committed Item List interfaces
 */
 int     xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
 */
 #define XLOG_UNMOUNT_REC_TYPE   (-1U)
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        add_wait_queue_exclusive(wq, &wait);
+        __set_current_state(TASK_UNINTERRUPTIBLE);
+        spin_unlock(lock);
+        schedule();
+        remove_wait_queue(wq, &wait);
+}
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458..204d8e5fa7f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 #endif
 /*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+        xfs_daddr_t             bc_blkno;
+        uint                    bc_len;
+        int                     bc_refcount;
+        struct list_head        bc_list;
+};
+/*
 * Sector aligned buffer routines for buffer create/read/write/access
 */
@@ -925,12 +936,12 @@ xlog_find_tail(
        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
        if (found == 2)
                log->l_curr_cycle++;
-        log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
+        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-        log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
+        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-        log->l_grant_reserve_cycle = log->l_curr_cycle;
+        xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
-        log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
+                                        BBTOB(log->l_curr_block));
-        log->l_grant_write_cycle = log->l_curr_cycle;
+        xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
-        log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+                                        BBTOB(log->l_curr_block));
        /*
         * Look for unmount record.  If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
        }
        after_umount_blk = (i + hblks + (int)
                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
-        tail_lsn = log->l_tail_lsn;
+        tail_lsn = atomic64_read(&log->l_tail_lsn);
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
                         * log records will point recovery to after the
                         * current unmount record.
                         */
-                        log->l_tail_lsn =
+                        xlog_assign_atomic_lsn(&log->l_tail_lsn,
-                                xlog_assign_lsn(log->l_curr_cycle,
+                                        log->l_curr_cycle, after_umount_blk);
-                                                after_umount_blk);
+                        xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
-                        log->l_last_sync_lsn =
+                                        log->l_curr_cycle, after_umount_blk);
-                                xlog_assign_lsn(log->l_curr_cycle,
-                                                after_umount_blk);
                        *tail_blk = after_umount_blk;
                        /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
 * record in the table to tell us how many times we expect to see this
 * record during the second pass.
 */
-STATIC void
+STATIC int
-xlog_recover_do_buffer_pass1(
+xlog_recover_buffer_pass1(
-        xlog_t                  *log,
+        struct log              *log,
-        xfs_buf_log_format_t    *buf_f)
+        xlog_recover_item_t     *item)
 {
-        xfs_buf_cancel_t        *bcp;
+        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-        xfs_buf_cancel_t        *nextp;
+        struct list_head        *bucket;
-        xfs_buf_cancel_t        *prevp;
+        struct xfs_buf_cancel   *bcp;
-        xfs_buf_cancel_t        **bucket;
-        xfs_daddr_t             blkno = 0;
-        uint                    len = 0;
-        ushort                  flags = 0;
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                blkno = buf_f->blf_blkno;
-                len = buf_f->blf_len;
-                flags = buf_f->blf_flags;
-                break;
-        }
        /*
         * If this isn't a cancel buffer item, then just return.
         */
-        if (!(flags & XFS_BLF_CANCEL)) {
+        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
-                return;
+                return 0;
-        }
-        /*
-         * Insert an xfs_buf_cancel record into the hash table of
-         * them.  If there is already an identical record, bump
-         * its reference count.
-         */
-        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                          XLOG_BC_TABLE_SIZE];
-        /*
-         * If the hash bucket is empty then just insert a new record into
-         * the bucket.
-         */
-        if (*bucket == NULL) {
-                bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
-                                                     KM_SLEEP);
-                bcp->bc_blkno = blkno;
-                bcp->bc_len = len;
-                bcp->bc_refcount = 1;
-                bcp->bc_next = NULL;
-                *bucket = bcp;
-                return;
        }
        /*
-         * The hash bucket is not empty, so search for duplicates of our
+         * Insert an xfs_buf_cancel record into the hash table of them.
-         * record.  If we find one them just bump its refcount.  If not
+         * If there is already an identical record, bump its reference count.
-         * then add us at the end of the list.
         */
-        prevp = NULL;
+        bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
-        nextp = *bucket;
+        list_for_each_entry(bcp, bucket, bc_list) {
-        while (nextp != NULL) {
+                if (bcp->bc_blkno == buf_f->blf_blkno &&
-                if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
+                    bcp->bc_len == buf_f->blf_len) {
-                        nextp->bc_refcount++;
+                        bcp->bc_refcount++;
                        trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
-                        return;
+                        return 0;
                }
-                prevp = nextp;
+        }
-                nextp = nextp->bc_next;
-        }
+        bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
-        ASSERT(prevp != NULL);
+        bcp->bc_blkno = buf_f->blf_blkno;
-        bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
+        bcp->bc_len = buf_f->blf_len;
-                                             KM_SLEEP);
-        bcp->bc_blkno = blkno;
-        bcp->bc_len = len;
        bcp->bc_refcount = 1;
-        bcp->bc_next = NULL;
+        list_add_tail(&bcp->bc_list, bucket);
-        prevp->bc_next = bcp;
        trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+        return 0;
 }
 /*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
 */
 STATIC int
 xlog_check_buffer_cancelled(
-        xlog_t                  *log,
+        struct log              *log,
        xfs_daddr_t             blkno,
        uint                    len,
        ushort                  flags)
 {
-        xfs_buf_cancel_t        *bcp;
+        struct list_head        *bucket;
-        xfs_buf_cancel_t        *prevp;
+        struct xfs_buf_cancel   *bcp;
-        xfs_buf_cancel_t        **bucket;
        if (log->l_buf_cancel_table == NULL) {
                /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
                return 0;
        }
-        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                          XLOG_BC_TABLE_SIZE];
-        bcp = *bucket;
-        if (bcp == NULL) {
-                /*
-                 * There is no corresponding entry in the table built
-                 * in pass one, so this buffer has not been cancelled.
-                 */
-                ASSERT(!(flags & XFS_BLF_CANCEL));
-                return 0;
-        }
        /*
-         * Search for an entry in the buffer cancel table that
+         * Search for an entry in the cancel table that matches our buffer.
-         * matches our buffer.
         */
-        prevp = NULL;
+        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
-        while (bcp != NULL) {
+        list_for_each_entry(bcp, bucket, bc_list) {
-                if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
+                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
-                        /*
+                        goto found;
-                         * We've go a match, so return 1 so that the
-                         * recovery of this buffer is cancelled.
-                         * If this buffer is actually a buffer cancel
-                         * log item, then decrement the refcount on the
-                         * one in the table and remove it if this is the
-                         * last reference.
-                         */
-                        if (flags & XFS_BLF_CANCEL) {
-                                bcp->bc_refcount--;
-                                if (bcp->bc_refcount == 0) {
-                                        if (prevp == NULL) {
-                                                *bucket = bcp->bc_next;
-                                        } else {
-                                                prevp->bc_next = bcp->bc_next;
-                                        }
-                                        kmem_free(bcp);
-                                }
-                        }
-                        return 1;
-                }
-                prevp = bcp;
-                bcp = bcp->bc_next;
        }
        /*
-         * We didn't find a corresponding entry in the table, so
+         * We didn't find a corresponding entry in the table, so return 0 so
-         * return 0 so that the buffer is NOT cancelled.
+         * that the buffer is NOT cancelled.
         */
        ASSERT(!(flags & XFS_BLF_CANCEL));
        return 0;
-}
-STATIC int
+found:
-xlog_recover_do_buffer_pass2(
+        /*
-        xlog_t                  *log,
+         * We've got a match, so return 1 so that the recovery of this buffer
-        xfs_buf_log_format_t    *buf_f)
+         * is cancelled.  If this buffer is actually a buffer cancel log
-{
+         * item, then decrement the refcount on the one in the table and
-        xfs_daddr_t             blkno = 0;
+         * remove it if this is the last reference.
-        ushort                  flags = 0;
+         */
-        uint                    len = 0;
+        if (flags & XFS_BLF_CANCEL) {
+                if (--bcp->bc_refcount == 0) {
-        switch (buf_f->blf_type) {
+                        list_del(&bcp->bc_list);
-        case XFS_LI_BUF:
+                        kmem_free(bcp);
-                blkno = buf_f->blf_blkno;
+                }
-                flags = buf_f->blf_flags;
-                len = buf_f->blf_len;
-                break;
        }
+        return 1;
-        return xlog_check_buffer_cancelled(log, blkno, len, flags);
 }
 /*
- * Perform recovery for a buffer full of inodes.  In these buffers,
+ * Perform recovery for a buffer full of inodes.  In these buffers, the only
- * the only data which should be recovered is that which corresponds
+ * data which should be recovered is that which corresponds to the
- * to the di_next_unlinked pointers in the on disk inode structures.
+ * di_next_unlinked pointers in the on disk inode structures.  The rest of the
- * The rest of the data for the inodes is always logged through the
+ * data for the inodes is always logged through the inodes themselves rather
- * inodes themselves rather than the inode buffer and is recovered
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
- * in xlog_recover_do_inode_trans().
 *
- * The only time when buffers full of inodes are fully recovered is
+ * The only time when buffers full of inodes are fully recovered is when the
- * when the buffer is full of newly allocated inodes.  In this case
+ * buffer is full of newly allocated inodes.  In this case the buffer will
- * the buffer will not be marked as an inode buffer and so will be
+ * not be marked as an inode buffer and so will be sent to
- * sent to xlog_recover_do_reg_buffer() below during recovery.
+ * xlog_recover_do_reg_buffer() below during recovery.
 */
 STATIC int
 xlog_recover_do_inode_buffer(
-        xfs_mount_t             *mp,
+        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
-        int                     item_index;
+        int                     item_index = 0;
-        int                     bit;
+        int                     bit = 0;
-        int                     nbits;
+        int                     nbits = 0;
-        int                     reg_buf_offset;
+        int                     reg_buf_offset = 0;
-        int                     reg_buf_bytes;
+        int                     reg_buf_bytes = 0;
        int                     next_unlinked_offset;
        int                     inodes_per_buf;
        xfs_agino_t             *logged_nextp;
        xfs_agino_t             *buffer_nextp;
-        unsigned int            *data_map = NULL;
-        unsigned int            map_size = 0;
        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                data_map = buf_f->blf_data_map;
-                map_size = buf_f->blf_map_size;
-                break;
-        }
-        /*
-         * Set the variables corresponding to the current region to
-         * 0 so that we'll initialize them on the first pass through
-         * the loop.
-         */
-        reg_buf_offset = 0;
-        reg_buf_bytes = 0;
-        bit = 0;
-        nbits = 0;
-        item_index = 0;
        inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
        for (i = 0; i < inodes_per_buf; i++) {
                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
                         * the current di_next_unlinked field.
                         */
                        bit += nbits;
-                        bit = xfs_next_bit(data_map, map_size, bit);
+                        bit = xfs_next_bit(buf_f->blf_data_map,
+                                           buf_f->blf_map_size, bit);
                        /*
                         * If there are no more logged regions in the
                         * buffer, then we're done.
                         */
-                        if (bit == -1) {
+                        if (bit == -1)
                                return 0;
-                        }
-                        nbits = xfs_contig_bits(data_map, map_size,
+                        nbits = xfs_contig_bits(buf_f->blf_data_map,
-                                                         bit);
+                                                buf_f->blf_map_size, bit);
                        ASSERT(nbits > 0);
                        reg_buf_offset = bit << XFS_BLF_SHIFT;
                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
                 * di_next_unlinked field, then move on to the next
                 * di_next_unlinked field.
                 */
-                if (next_unlinked_offset < reg_buf_offset) {
+                if (next_unlinked_offset < reg_buf_offset)
                        continue;
-                }
                ASSERT(item->ri_buf[item_index].i_addr != NULL);
                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
 * given buffer.  The bitmap in the buf log format structure indicates
 * where to place the logged data.
 */
-/*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
        int                     bit;
        int                     nbits;
-        unsigned int            *data_map = NULL;
-        unsigned int            map_size = 0;
        int                     error;
        trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                data_map = buf_f->blf_data_map;
-                map_size = buf_f->blf_map_size;
-                break;
-        }
        bit = 0;
        i = 1;  /* 0 is the buf format structure */
        while (1) {
-                bit = xfs_next_bit(data_map, map_size, bit);
+                bit = xfs_next_bit(buf_f->blf_data_map,
+                                   buf_f->blf_map_size, bit);
                if (bit == -1)
                        break;
-                nbits = xfs_contig_bits(data_map, map_size, bit);
+                nbits = xfs_contig_bits(buf_f->blf_data_map,
+                                        buf_f->blf_map_size, bit);
                ASSERT(nbits > 0);
                ASSERT(item->ri_buf[i].i_addr != NULL);
                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
 * for more details on the implementation of the table of cancel records.
 */
 STATIC int
-xlog_recover_do_buffer_trans(
+xlog_recover_buffer_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        int                     error;
-        int                     cancel;
-        xfs_daddr_t             blkno;
-        int                     len;
-        ushort                  flags;
        uint                    buf_flags;
-        if (pass == XLOG_RECOVER_PASS1) {
+        /*
-                /*
+         * In this pass we only want to recover all the buffers which have
-                 * In this pass we're only looking for buf items
+         * not been cancelled and are not cancellation buffers themselves.
-                 * with the XFS_BLF_CANCEL bit set.
+         */
-                 */
+        if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
-                xlog_recover_do_buffer_pass1(log, buf_f);
+                        buf_f->blf_len, buf_f->blf_flags)) {
+                trace_xfs_log_recover_buf_cancel(log, buf_f);
                return 0;
-        } else {
-                /*
-                 * In this pass we want to recover all the buffers
-                 * which have not been cancelled and are not
-                 * cancellation buffers themselves.  The routine
-                 * we call here will tell us whether or not to
-                 * continue with the replay of this buffer.
-                 */
-                cancel = xlog_recover_do_buffer_pass2(log, buf_f);
-                if (cancel) {
-                        trace_xfs_log_recover_buf_cancel(log, buf_f);
-                        return 0;
-                }
        }
        trace_xfs_log_recover_buf_recover(log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                blkno = buf_f->blf_blkno;
-                len = buf_f->blf_len;
-                flags = buf_f->blf_flags;
-                break;
-        default:
-                xfs_fs_cmn_err(CE_ALERT, log->l_mp,
-                        "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
-                        buf_f->blf_type, log->l_mp->m_logname ?
-                        log->l_mp->m_logname : "internal");
-                XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
-                                 XFS_ERRLEVEL_LOW, log->l_mp);
-                return XFS_ERROR(EFSCORRUPTED);
-        }
-        mp = log->l_mp;
        buf_flags = XBF_LOCK;
-        if (!(flags & XFS_BLF_INODE_BUF))
+        if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
                buf_flags |= XBF_MAPPED;
-        bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
+        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+                          buf_flags);
        if (XFS_BUF_ISERROR(bp)) {
-                xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
+                xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
-                                  bp, blkno);
+                                  bp, buf_f->blf_blkno);
                error = XFS_BUF_GETERROR(bp);
                xfs_buf_relse(bp);
                return error;
        }
        error = 0;
-        if (flags & XFS_BLF_INODE_BUF) {
+        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-        } else if (flags &
+        } else if (buf_f->blf_flags &
                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
        } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
 }
 STATIC int
-xlog_recover_do_inode_trans(
+xlog_recover_inode_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_inode_log_format_t  *in_f;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        xfs_dinode_t            *dip;
-        xfs_ino_t               ino;
        int                     len;
        xfs_caddr_t             src;
        xfs_caddr_t             dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
        xfs_icdinode_t          *dicp;
        int                     need_free = 0;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
                in_f = item->ri_buf[0].i_addr;
        } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
                if (error)
                        goto error;
        }
-        ino = in_f->ilf_ino;
-        mp = log->l_mp;
        /*
         * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
-                        dip, bp, ino);
+                        dip, bp, in_f->ilf_ino);
-                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
+                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
-                        item, ino);
+                        item, in_f->ilf_ino);
-                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
+                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
        if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
-                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
                                         XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                                item, dip, bp, ino);
+                                item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
-                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
                                             XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                                item, dip, bp, ino);
+                                item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
-                        item, dip, bp, ino,
+                        item, dip, bp, in_f->ilf_ino,
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
-                        item, dip, bp, ino, dicp->di_forkoff);
+                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
                        break;
                default:
-                        xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
+                        xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
                        ASSERT(0);
                        xfs_buf_relse(bp);
                        error = EIO;
@@ -2556,18 +2422,11 @@ error:
 * of that type.
 */
 STATIC int
-xlog_recover_do_quotaoff_trans(
+xlog_recover_quotaoff_pass1(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
-        xfs_qoff_logformat_t    *qoff_f;
+        xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
-        if (pass == XLOG_RECOVER_PASS2) {
-                return (0);
-        }
-        qoff_f = item->ri_buf[0].i_addr;
        ASSERT(qoff_f);
        /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
 * Recover a dquot record
 */
 STATIC int
-xlog_recover_do_dquot_trans(
+xlog_recover_dquot_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        struct xfs_disk_dquot   *ddq, *recddq;
        int                     error;
        xfs_dq_logformat_t      *dq_f;
        uint                    type;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
-        mp = log->l_mp;
        /*
         * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
        if ((error = xfs_qm_dqcheck(recddq,
                           dq_f->qlf_id,
                           0, XFS_QMOPT_DOWARN,
-                           "xlog_recover_do_dquot_trans (log copy)"))) {
+                           "xlog_recover_dquot_pass2 (log copy)"))) {
                return XFS_ERROR(EIO);
        }
        ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
         * minimal initialization then.
         */
        if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-                           "xlog_recover_do_dquot_trans")) {
+                           "xlog_recover_dquot_pass2")) {
                xfs_buf_relse(bp);
                return XFS_ERROR(EIO);
        }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
 * LSN.
 */
 STATIC int
-xlog_recover_do_efi_trans(
+xlog_recover_efi_pass2(
        xlog_t                  *log,
        xlog_recover_item_t     *item,
-        xfs_lsn_t               lsn,
+        xfs_lsn_t               lsn)
-        int                     pass)
 {
        int                     error;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_efi_log_item_t      *efip;
        xfs_efi_log_format_t    *efi_formatp;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
        efi_formatp = item->ri_buf[0].i_addr;
-        mp = log->l_mp;
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
                                         &(efip->efi_format)))) {
                xfs_efi_item_free(efip);
                return error;
        }
-        efip->efi_next_extent = efi_formatp->efi_nextents;
+        atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
-        efip->efi_flags |= XFS_EFI_COMMITTED;
        spin_lock(&log->l_ailp->xa_lock);
        /*
         * xfs_trans_ail_update() drops the AIL lock.
         */
-        xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
+        xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
        return 0;
 }
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
 * efd format structure.  If we find it, we remove the efi from the
 * AIL and free it.
 */
-STATIC void
+STATIC int
-xlog_recover_do_efd_trans(
+xlog_recover_efd_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_efd_log_format_t    *efd_formatp;
        xfs_efi_log_item_t      *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp = log->l_ailp;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return;
-        }
        efd_formatp = item->ri_buf[0].i_addr;
        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
        }
        xfs_trans_ail_cursor_done(ailp, &cur);
        spin_unlock(&ailp->xa_lock);
-}
-/*
- * Perform the transaction
- *
- * If the transaction modifies a buffer or inode, do it now.  Otherwise,
- * EFIs and EFDs get queued up by adding entries into the AIL for them.
- */
-STATIC int
-xlog_recover_do_trans(
-        xlog_t                  *log,
-        xlog_recover_t          *trans,
-        int                     pass)
-{
-        int                     error = 0;
-        xlog_recover_item_t     *item;
-        error = xlog_recover_reorder_trans(log, trans, pass);
-        if (error)
-                return error;
-        list_for_each_entry(item, &trans->r_itemq, ri_list) {
-                trace_xfs_log_recover_item_recover(log, trans, item, pass);
-                switch (ITEM_TYPE(item)) {
-                case XFS_LI_BUF:
-                        error = xlog_recover_do_buffer_trans(log, item, pass);
-                        break;
-                case XFS_LI_INODE:
-                        error = xlog_recover_do_inode_trans(log, item, pass);
-                        break;
-                case XFS_LI_EFI:
-                        error = xlog_recover_do_efi_trans(log, item,
-                                                          trans->r_lsn, pass);
-                        break;
-                case XFS_LI_EFD:
-                        xlog_recover_do_efd_trans(log, item, pass);
-                        error = 0;
-                        break;
-                case XFS_LI_DQUOT:
-                        error = xlog_recover_do_dquot_trans(log, item, pass);
-                        break;
-                case XFS_LI_QUOTAOFF:
-                        error = xlog_recover_do_quotaoff_trans(log, item,
-                                                               pass);
-                        break;
-                default:
-                        xlog_warn(
-        "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
-                        ASSERT(0);
-                        error = XFS_ERROR(EIO);
-                        break;
-                }
-                if (error)
-                        return error;
-        }
        return 0;
 }
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
 */
 STATIC void
 xlog_recover_free_trans(
-        xlog_recover_t          *trans)
+        struct xlog_recover     *trans)
 {
        xlog_recover_item_t     *item, *n;
        int                     i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
 }
 STATIC int
+xlog_recover_commit_pass1(
+        struct log              *log,
+        struct xlog_recover     *trans,
+        xlog_recover_item_t     *item)
+{
+        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+        switch (ITEM_TYPE(item)) {
+        case XFS_LI_BUF:
+                return xlog_recover_buffer_pass1(log, item);
+        case XFS_LI_QUOTAOFF:
+                return xlog_recover_quotaoff_pass1(log, item);
+        case XFS_LI_INODE:
+        case XFS_LI_EFI:
+        case XFS_LI_EFD:
+        case XFS_LI_DQUOT:
+                /* nothing to do in pass 1 */
+                return 0;
+        default:
+                xlog_warn(
+        "XFS: invalid item type (%d) xlog_recover_commit_pass1",
+                        ITEM_TYPE(item));
+                ASSERT(0);
+                return XFS_ERROR(EIO);
+        }
+}
+STATIC int
+xlog_recover_commit_pass2(
+        struct log              *log,
+        struct xlog_recover     *trans,
+        xlog_recover_item_t     *item)
+{
+        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+        switch (ITEM_TYPE(item)) {
+        case XFS_LI_BUF:
+                return xlog_recover_buffer_pass2(log, item);
+        case XFS_LI_INODE:
+                return xlog_recover_inode_pass2(log, item);
+        case XFS_LI_EFI:
+                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+        case XFS_LI_EFD:
+                return xlog_recover_efd_pass2(log, item);
+        case XFS_LI_DQUOT:
+                return xlog_recover_dquot_pass2(log, item);
+        case XFS_LI_QUOTAOFF:
+                /* nothing to do in pass2 */
+                return 0;
+        default:
+                xlog_warn(
+        "XFS: invalid item type (%d) xlog_recover_commit_pass2",
+                        ITEM_TYPE(item));
+                ASSERT(0);
+                return XFS_ERROR(EIO);
+        }
+}
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
 xlog_recover_commit_trans(
-        xlog_t                  *log,
+        struct log              *log,
-        xlog_recover_t          *trans,
+        struct xlog_recover     *trans,
        int                     pass)
 {
-        int                     error;
+        int                     error = 0;
+        xlog_recover_item_t     *item;
        hlist_del(&trans->r_list);
-        if ((error = xlog_recover_do_trans(log, trans, pass)))
+        error = xlog_recover_reorder_trans(log, trans, pass);
+        if (error)
                return error;
-        xlog_recover_free_trans(trans);                 /* no error */
+        list_for_each_entry(item, &trans->r_itemq, ri_list) {
+                if (pass == XLOG_RECOVER_PASS1)
+                        error = xlog_recover_commit_pass1(log, trans, item);
+                else
+                        error = xlog_recover_commit_pass2(log, trans, item);
+                if (error)
+                        return error;
+        }
+        xlog_recover_free_trans(trans);
        return 0;
 }
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
        xfs_extent_t            *extp;
        xfs_fsblock_t           startblock_fsb;
-        ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+        ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
        /*
         * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
                                         extp->ext_len);
        }
-        efip->efi_flags |= XFS_EFI_RECOVERED;
+        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
        error = xfs_trans_commit(tp, 0);
        return error;
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
                 * Skip EFIs that we've already processed.
                 */
                efip = (xfs_efi_log_item_t *)lip;
-                if (efip->efi_flags & XFS_EFI_RECOVERED) {
+                if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
                        lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
                }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
        xfs_daddr_t     head_blk,
        xfs_daddr_t     tail_blk)
 {
-        int             error;
+        int             error, i;
        ASSERT(head_blk != tail_blk);
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
         * First do a pass to find all of the cancelled buf log items.
         * Store them in the buf_cancel_table for use in the second pass.
         */
-        log->l_buf_cancel_table =
+        log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
-                (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
+                                                 sizeof(struct list_head),
-                                                 sizeof(xfs_buf_cancel_t*),
                                                 KM_SLEEP);
+        for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+                INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
                                      XLOG_RECOVER_PASS1);
        if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
                int     i;
                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-                        ASSERT(log->l_buf_cancel_table[i] == NULL);
+                        ASSERT(list_empty(&log->l_buf_cancel_table[i]));
        }
 #endif  /* DEBUG */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c25..d447aef84bc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
                        goto out_unwind;
                pag->pag_agno = index;
                pag->pag_mount = mp;
-                rwlock_init(&pag->pag_ici_lock);
+                spin_lock_init(&pag->pag_ici_lock);
                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 }
 /*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+        struct xfs_mount        *mp)
+{
+        int i;
+        for (i = 0; i < XFS_LOWSP_MAX; i++) {
+                __uint64_t space = mp->m_sb.sb_dblocks;
+                do_div(space, 100);
+                mp->m_low_space[i] = space * (i + 1);
+        }
+}
+/*
 * Set whether we're using inode alignment.
 */
 STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
         */
        xfs_set_rw_sizes(mp);
+        /* set the low space thresholds for dynamic preallocation */
+        xfs_set_low_space_thresholds(mp);
        /*
         * Set the inode cluster size.
         * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b498074..a62e8971539 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
        xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+        XFS_LOWSP_1_PCNT = 0,
+        XFS_LOWSP_2_PCNT,
+        XFS_LOWSP_3_PCNT,
+        XFS_LOWSP_4_PCNT,
+        XFS_LOWSP_5_PCNT,
+        XFS_LOWSP_MAX,
+};
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
+        int64_t                 m_low_space[XFS_LOWSP_MAX];
+                                                /* low free space thresholds */
 } xfs_mount_t;
 /*
@@ -379,6 +391,8 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 extern int      xfs_dev_is_read_only(struct xfs_mount *, char *);
+extern void     xfs_set_low_space_thresholds(struct xfs_mount *);
 #endif  /* __KERNEL__ */
 extern void     xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2..edfa178bafb 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
        spin_lock(&mru->lock);
        if (mru->queued) {
                spin_unlock(&mru->lock);
-                cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+                cancel_delayed_work_sync(&mru->work);
                spin_lock(&mru->lock);
        }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711..f80a067a465 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
 * they could be immediately flushed and we'd have to race with the flusher
 * trying to pull the item from the AIL as we add it.
 */
-void
+static void
 xfs_trans_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
        xfs_trans_free(tp);
 }
+static inline void
+xfs_log_item_batch_insert(
+        struct xfs_ail          *ailp,
+        struct xfs_log_item     **log_items,
+        int                     nr_items,
+        xfs_lsn_t               commit_lsn)
+{
+        int     i;
+        spin_lock(&ailp->xa_lock);
+        /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+        xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+        for (i = 0; i < nr_items; i++)
+                IOP_UNPIN(log_items[i], 0);
+}
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ */
+void
+xfs_trans_committed_bulk(
+        struct xfs_ail          *ailp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               commit_lsn,
+        int                     aborted)
+{
+#define LOG_ITEM_BATCH_SIZE     32
+        struct xfs_log_item     *log_items[LOG_ITEM_BATCH_SIZE];
+        struct xfs_log_vec      *lv;
+        int                     i = 0;
+        /* unpin all the log items */
+        for (lv = log_vector; lv; lv = lv->lv_next ) {
+                struct xfs_log_item     *lip = lv->lv_item;
+                xfs_lsn_t               item_lsn;
+                if (aborted)
+                        lip->li_flags |= XFS_LI_ABORTED;
+                item_lsn = IOP_COMMITTED(lip, commit_lsn);
+                /* item_lsn of -1 means the item was freed */
+                if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+                        continue;
+                if (item_lsn != commit_lsn) {
+                        /*
+                         * Not a bulk update option due to unusual item_lsn.
+                         * Push into AIL immediately, rechecking the lsn once
+                         * we have the ail lock. Then unpin the item.
+                         */
+                        spin_lock(&ailp->xa_lock);
+                        if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+                                xfs_trans_ail_update(ailp, lip, item_lsn);
+                        else
+                                spin_unlock(&ailp->xa_lock);
+                        IOP_UNPIN(lip, 0);
+                        continue;
+                }
+                /* Item is a candidate for bulk AIL insert.  */
+                log_items[i++] = lv->lv_item;
+                if (i >= LOG_ITEM_BATCH_SIZE) {
+                        xfs_log_item_batch_insert(ailp, log_items,
+                                        LOG_ITEM_BATCH_SIZE, commit_lsn);
+                        i = 0;
+                }
+        }
+        /* make sure we insert the remainder! */
+        if (i)
+                xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+}
 /*
 * Called from the trans_commit code when we notice that
 * the filesystem is in the middle of a forced shutdown.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a8..c2042b736b8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
 #define XFS_ALLOC_BTREE_REF     2
 #define XFS_BMAP_BTREE_REF      2
 #define XFS_DIR_BTREE_REF       2
+#define XFS_INO_REF             2
 #define XFS_ATTR_BTREE_REF      1
-#define XFS_INO_REF             1
 #define XFS_DQUOT_REF           1
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff..c5bbbc45db9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
-STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
-STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
 STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
 STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
                xfs_log_move_tail(ailp->xa_mount, 1);
 }       /* xfs_trans_unlocked_item */
 /*
- * Update the position of the item in the AIL with the new
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
- * lsn.  If it is not yet in the AIL, add it.  Otherwise, move
+ *
- * it to its new position by removing it and re-adding it.
+ * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added.  Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
 *
- * Wakeup anyone with an lsn less than the item's lsn.  If the item
+ * The AIL lock is taken once by the caller to cover the update operations on
- * we move in the AIL is the minimum one, update the tail lsn in the
+ * all the items in the array, so this function is entered with the AIL lock
- * log manager.
+ * held. Under that lock, we need to check each log item LSN to confirm it
+ * needs to be moved forward in the AIL.
 *
- * This function must be called with the AIL lock held.  The lock
+ * To optimise the insert operation, we delete all the items from the AIL in
- * is dropped before returning.
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
 */
 void
-xfs_trans_ail_update(
+xfs_trans_ail_update_bulk(
-        struct xfs_ail  *ailp,
+        struct xfs_ail          *ailp,
-        xfs_log_item_t  *lip,
+        struct xfs_log_item     **log_items,
-        xfs_lsn_t       lsn) __releases(ailp->xa_lock)
+        int                     nr_items,
+        xfs_lsn_t               lsn) __releases(ailp->xa_lock)
 {
-        xfs_log_item_t          *dlip = NULL;
+        xfs_log_item_t          *mlip;
-        xfs_log_item_t          *mlip;  /* ptr to minimum lip */
        xfs_lsn_t               tail_lsn;
+        int                     mlip_changed = 0;
+        int                     i;
+        LIST_HEAD(tmp);
        mlip = xfs_ail_min(ailp);
-        if (lip->li_flags & XFS_LI_IN_AIL) {
+        for (i = 0; i < nr_items; i++) {
-                dlip = xfs_ail_delete(ailp, lip);
+                struct xfs_log_item *lip = log_items[i];
-                ASSERT(dlip == lip);
+                if (lip->li_flags & XFS_LI_IN_AIL) {
-                xfs_trans_ail_cursor_clear(ailp, dlip);
+                        /* check if we really need to move the item */
-        } else {
+                        if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
-                lip->li_flags |= XFS_LI_IN_AIL;
+                                continue;
+                        xfs_ail_delete(ailp, lip);
+                        if (mlip == lip)
+                                mlip_changed = 1;
+                } else {
+                        lip->li_flags |= XFS_LI_IN_AIL;
+                }
+                lip->li_lsn = lsn;
+                list_add(&lip->li_ail, &tmp);
        }
-        lip->li_lsn = lsn;
+        xfs_ail_splice(ailp, &tmp, lsn);
-        xfs_ail_insert(ailp, lip);
-        if (mlip == dlip) {
+        if (!mlip_changed) {
-                mlip = xfs_ail_min(ailp);
-                /*
-                 * It is not safe to access mlip after the AIL lock is
-                 * dropped, so we must get a copy of li_lsn before we do
-                 * so.  This is especially important on 32-bit platforms
-                 * where accessing and updating 64-bit values like li_lsn
-                 * is not atomic.
-                 */
-                tail_lsn = mlip->li_lsn;
-                spin_unlock(&ailp->xa_lock);
-                xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-        } else {
                spin_unlock(&ailp->xa_lock);
+                return;
        }
+        /*
-}       /* xfs_trans_update_ail */
+         * It is not safe to access mlip after the AIL lock is dropped, so we
+         * must get a copy of li_lsn before we do so.  This is especially
+         * important on 32-bit platforms where accessing and updating 64-bit
+         * values like li_lsn is not atomic.
+         */
+        mlip = xfs_ail_min(ailp);
+        tail_lsn = mlip->li_lsn;
+        spin_unlock(&ailp->xa_lock);
+        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
 /*
- * Delete the given item from the AIL.  It must already be in
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
- * the AIL.
 *
- * Wakeup anyone with an lsn less than item's lsn.    If the item
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
- * we delete in the AIL is the minimum one, update the tail lsn in the
+ * be removed from the AIL. The caller already holds the AIL lock and has done
- * log manager.
+ * all the checks necessary to ensure the items passed in via @log_items are
+ * ready for deletion. This includes checking that the items are in the AIL.
 *
- * Clear the IN_AIL flag from the item, reset its lsn to 0, and
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
- * bump the AIL's generation count to indicate that the tree
+ * flag from the item and reset the item's lsn to 0. If we remove the first
- * has changed.
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
 *
- * This function must be called with the AIL lock held.  The lock
+ * This function will not drop the AIL lock until all items are removed from
- * is dropped before returning.
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
 */
 void
-xfs_trans_ail_delete(
+xfs_trans_ail_delete_bulk(
-        struct xfs_ail  *ailp,
+        struct xfs_ail          *ailp,
-        xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+        struct xfs_log_item     **log_items,
+        int                     nr_items) __releases(ailp->xa_lock)
 {
-        xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
        xfs_lsn_t               tail_lsn;
+        int                     mlip_changed = 0;
+        int                     i;
-        if (lip->li_flags & XFS_LI_IN_AIL) {
+        mlip = xfs_ail_min(ailp);
-                mlip = xfs_ail_min(ailp);
-                dlip = xfs_ail_delete(ailp, lip);
-                ASSERT(dlip == lip);
-                xfs_trans_ail_cursor_clear(ailp, dlip);
-                lip->li_flags &= ~XFS_LI_IN_AIL;
+        for (i = 0; i < nr_items; i++) {
-                lip->li_lsn = 0;
+                struct xfs_log_item *lip = log_items[i];
+                if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+                        struct xfs_mount        *mp = ailp->xa_mount;
-                if (mlip == dlip) {
-                        mlip = xfs_ail_min(ailp);
-                        /*
-                         * It is not safe to access mlip after the AIL lock
-                         * is dropped, so we must get a copy of li_lsn
-                         * before we do so.  This is especially important
-                         * on 32-bit platforms where accessing and updating
-                         * 64-bit values like li_lsn is not atomic.
-                         */
-                        tail_lsn = mlip ? mlip->li_lsn : 0;
-                        spin_unlock(&ailp->xa_lock);
-                        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-                } else {
                        spin_unlock(&ailp->xa_lock);
+                        if (!XFS_FORCED_SHUTDOWN(mp)) {
+                                xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+                "%s: attempting to delete a log item that is not in the AIL",
+                                                __func__);
+                                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+                        }
+                        return;
                }
+                xfs_ail_delete(ailp, lip);
+                lip->li_flags &= ~XFS_LI_IN_AIL;
+                lip->li_lsn = 0;
+                if (mlip == lip)
+                        mlip_changed = 1;
        }
-        else {
-                /*
-                 * If the file system is not being shutdown, we are in
-                 * serious trouble if we get to this stage.
-                 */
-                struct xfs_mount        *mp = ailp->xa_mount;
+        if (!mlip_changed) {
                spin_unlock(&ailp->xa_lock);
-                if (!XFS_FORCED_SHUTDOWN(mp)) {
+                return;
-                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-                "%s: attempting to delete a log item that is not in the AIL",
-                                        __func__);
-                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-                }
        }
-}
+        /*
+         * It is not safe to access mlip after the AIL lock is dropped, so we
+         * must get a copy of li_lsn before we do so.  This is especially
+         * important on 32-bit platforms where accessing and updating 64-bit
+         * values like li_lsn is not atomic. It is possible we've emptied the
+         * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+         */
+        mlip = xfs_ail_min(ailp);
+        tail_lsn = mlip ? mlip->li_lsn : 0;
+        spin_unlock(&ailp->xa_lock);
+        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
 /*
 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
 }
 /*
- * Insert the given log item into the AIL.
+ * Splice the log item list into the AIL at the given LSN.
- * We almost always insert at the end of the list, so on inserts
- * we search from the end of the list to find where the
- * new item belongs.
 */
 STATIC void
-xfs_ail_insert(
+xfs_ail_splice(
        struct xfs_ail  *ailp,
-        xfs_log_item_t  *lip)
+        struct list_head *list,
-/* ARGSUSED */
+        xfs_lsn_t       lsn)
 {
        xfs_log_item_t  *next_lip;
@@ -640,39 +660,33 @@ xfs_ail_insert(
         * If the list is empty, just insert the item.
         */
        if (list_empty(&ailp->xa_ail)) {
-                list_add(&lip->li_ail, &ailp->xa_ail);
+                list_splice(list, &ailp->xa_ail);
                return;
        }
        list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-                if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+                if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
                        break;
        }
        ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
-               (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
+               (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
-        list_add(&lip->li_ail, &next_lip->li_ail);
-        xfs_ail_check(ailp, lip);
+        list_splice_init(list, &next_lip->li_ail);
        return;
 }
 /*
 * Delete the given item from the AIL.  Return a pointer to the item.
 */
-/*ARGSUSED*/
+STATIC void
-STATIC xfs_log_item_t *
 xfs_ail_delete(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        xfs_ail_check(ailp, lip);
        list_del(&lip->li_ail);
+        xfs_trans_ail_cursor_clear(ailp, lip);
-        return lip;
 }
 /*
@@ -682,7 +696,6 @@ xfs_ail_delete(
 STATIC xfs_log_item_t *
 xfs_ail_min(
        struct xfs_ail  *ailp)
-/* ARGSUSED */
 {
        if (list_empty(&ailp->xa_ail))
                return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
 xfs_ail_next(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        if (lip->li_ail.next == &ailp->xa_ail)
                return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa7..f7590f5bade 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t		*tp,
        tp->t_flags |= XFS_TRANS_DIRTY;
        efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-        next_extent = efip->efi_next_extent;
+        /*
+         * atomic_inc_return gives us the value after the increment;
+         * we want to use it as an array index so we need to subtract 1 from
+         * it.
+         */
+        next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
        ASSERT(next_extent < efip->efi_format.efi_nextents);
        extp = &(efip->efi_format.efi_extents[next_extent]);
        extp->ext_start = start_block;
        extp->ext_len = ext_len;
-        efip->efi_next_extent++;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de..35162c238fa 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
 struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
 void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void    xfs_trans_del_item(struct xfs_log_item *);
 void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
                                int flags);
-void    xfs_trans_item_committed(struct xfs_log_item *lip,
-                                xfs_lsn_t commit_lsn, int aborted);
 void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+void    xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+                                xfs_lsn_t commit_lsn, int aborted);
 /*
 * AIL traversal cursor.
 *
@@ -73,12 +75,29 @@ struct xfs_ail {
 /*
 * From xfs_trans_ail.c
 */
-void                    xfs_trans_ail_update(struct xfs_ail *ailp,
+void    xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
-                                        struct xfs_log_item *lip, xfs_lsn_t lsn)
+                                struct xfs_log_item **log_items, int nr_items,
-                                        __releases(ailp->xa_lock);
+                                xfs_lsn_t lsn) __releases(ailp->xa_lock);
-void                    xfs_trans_ail_delete(struct xfs_ail *ailp,
+static inline void
-                                        struct xfs_log_item *lip)
+xfs_trans_ail_update(
-                                        __releases(ailp->xa_lock);
+        struct xfs_ail          *ailp,
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn) __releases(ailp->xa_lock)
+{
+        xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+}
+void    xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+                                struct xfs_log_item **log_items, int nr_items)
+                                __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+        struct xfs_ail  *ailp,
+        xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+{
+        xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
 void                    xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
 void                    xfs_trans_unlocked_item(struct xfs_ail *,
                                        xfs_log_item_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151..d8e6f8cd6f0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
                        xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
        }
-        if (ip->i_d.di_nlink != 0) {
+        if (ip->i_d.di_nlink == 0)
-                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+                return 0;
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
-                     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-                    (!(ip->i_d.di_flags &
-                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-                        /*
+        if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                         * If we can't get the iolock just skip truncating
+             ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                         * the blocks past EOF because we could deadlock
+               ip->i_delayed_blks > 0)) &&
-                         * with the mmap_sem otherwise.  We'll get another
+             (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-                         * chance to drop them once the last reference to
+            (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-                         * the inode is dropped, so we'll never leak blocks
-                         * permanently.
-                         */
-                        error = xfs_free_eofblocks(mp, ip,
-                                                   XFS_FREE_EOF_TRYLOCK);
-                        if (error)
-                                return error;
-                }
-        }
+                /*
+                 * If we can't get the iolock just skip truncating the blocks
+                 * past EOF because we could deadlock with the mmap_sem
+                 * otherwise.  We'll get another chance to drop them once the
+                 * last reference to the inode is dropped, so we'll never leak
+                 * blocks permanently.
+                 *
+                 * Further, if the inode is being opened, written and
+                 * closed frequently and we have delayed allocation blocks
+                 * outstanding (e.g. streaming writes from the NFS server),
+                 * truncating the blocks past EOF will cause fragmentation to
+                 * occur.
+                 *
+                 * In this case don't do the truncation, either, but we have to
+                 * be careful how we detect this case. Blocks beyond EOF show
+                 * up as i_delayed_blks even when the inode is clean, so we
+                 * need to truncate them away first before checking for a dirty
+                 * release. Hence on the first dirty close we will still remove
+                 * the speculative allocation, but after that we will leave it
+                 * in place.
+                 */
+                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                        return 0;
+                error = xfs_free_eofblocks(mp, ip,
+                                           XFS_FREE_EOF_TRYLOCK);
+                if (error)
+                        return error;
+                /* delalloc blocks after truncation means it really is dirty */
+                if (ip->i_delayed_blks)
+                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+        }
        return 0;
 }