Merge branch 'next' into for-linus

author: Dmitry Torokhov <dmitry.torokhov@gmail.com> 2011-03-19 02:38:50 -0400
committer: Dmitry Torokhov <dmitry.torokhov@gmail.com> 2011-03-19 02:38:50 -0400
commit: 97eb3f24352ec6632c2127b35d8087d2a809a9b9 (patch)
tree: 722948059bbd325bbca232269490124231df80d4 /fs
parent: 439581ec07fa9cf3f519dd461a2cf41cfd3adcb4 (diff)
parent: def179c271ac9b5020deca798470521f14d11edd (diff)
600 files changed, 28864 insertions, 17806 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e051147679..814ac4e213a 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
          If unsure, say N.
+if 9P_FS
 config 9P_FSCACHE
        bool "Enable 9P client caching support (EXPERIMENTAL)"
        depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
 config 9P_FS_POSIX_ACL
        bool "9P POSIX Access Control Lists"
-        depends on 9P_FS
        select FS_POSIX_ACL
        help
          POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
          Linux website <http://acl.bestbits.at/>.
          If you don't know what Access Control Lists are, say N
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1..ab8c1278063 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
        vfs_super.o \
        vfs_inode.o \
+        vfs_inode_dotl.o \
        vfs_addr.o \
        vfs_file.o \
        vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 12d602351db..02a2cf61631 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
        ssize_t size;
        void *value = NULL;
-        struct posix_acl *acl = NULL;;
+        struct posix_acl *acl = NULL;
        size = v9fs_fid_xattr_get(fid, name, NULL, 0);
        if (size > 0) {
@@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
        return acl;
 }
-int v9fs_check_acl(struct inode *inode, int mask)
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        struct v9fs_session_info *v9ses;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        v9ses = v9fs_inode2v9ses(inode);
        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
                /*
@@ -362,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
        case ACL_TYPE_DEFAULT:
                name = POSIX_ACL_XATTR_DEFAULT;
                if (!S_ISDIR(inode->i_mode)) {
-                        retval = -EINVAL;
+                        retval = acl ? -EINVAL : 0;
                        goto err_out;
                }
                break;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 59e18c2e8c7..7ef3ac9f6d9 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
                               struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2..c4b5d8864f0 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
                                                                        char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+                        struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+                        void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+                        struct p9_fid *fid,
+                        struct super_block *sb);
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+                        struct p9_fid *fid,
+                        struct super_block *sb);
 /* other default globals */
 #define V9FS_PORT       564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
        return v9ses->flags & V9FS_PROTO_2000L;
 }
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ * Picks the protocol-specific helper: v9fs_inode_dotl() when
+ * v9fs_proto_dotl() reports that V9FS_PROTO_2000L is set in the session
+ * flags, v9fs_inode() otherwise.  All three arguments are forwarded
+ * untouched and the helper's return value is handed back as is.
+ *
+ * NOTE(review): the v9fs_inode() error path shown in this patch returns
+ * ERR_PTR(err) and the callers visible here test the result with
+ * IS_ERR(), so failures are presumably reported as error pointers, never
+ * NULL - confirm against the v9fs_inode_dotl() definition.
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                                struct super_block *sb)
+{
+        if (v9fs_proto_dotl(v9ses))
+                return v9fs_inode_dotl(v9ses, fid, sb);
+        else
+                return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index bab0eac873f..b789f8e597e 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f393..233b7d4ffe5 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
 *
 */
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
                                                                        dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
 *
 */
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry)
 *
 */
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
 {
        struct v9fs_dentry *dent;
        struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 34bf71b5654..b76a40bdf4c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
-static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
-static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
-static const struct inode_operations v9fs_symlink_inode_operations_dotl;
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                    dev_t rdev);
 /**
 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -237,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 *
 */
-void v9fs_destroy_inode(struct inode *inode)
+static void v9fs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
 }
-#endif
-/**
+void v9fs_destroy_inode(struct inode *inode)
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
- * new file system object. This checks the S_ISGID to determine the owning
- * group of the new file system object.
- */
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
-{
-        BUG_ON(dir_inode == NULL);
-        if (dir_inode->i_mode & S_ISGID) {
-                /* set_gid bit is set.*/
-                return dir_inode->i_gid;
-        }
-        return current_fsgid();
-}
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 {
-        struct dentry *dentry;
+        call_rcu(&inode->i_rcu, v9fs_i_callback);
-        spin_lock(&dcache_lock);
-        /* Directory should have only one entry. */
-        BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-        dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-        spin_unlock(&dcache_lock);
-        return dentry;
 }
+#endif
 /**
 * v9fs_get_inode - helper function to setup an inode
@@ -447,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
-static struct inode *
+struct inode *
 v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
        struct super_block *sb)
 {
@@ -482,60 +447,6 @@ error:
        return ERR_PTR(err);
 }
-static struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-        struct super_block *sb)
-{
-        struct inode *ret = NULL;
-        int err;
-        struct p9_stat_dotl *st;
-        st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-        if (IS_ERR(st))
-                return ERR_CAST(st);
-        ret = v9fs_get_inode(sb, st->st_mode);
-        if (IS_ERR(ret)) {
-                err = PTR_ERR(ret);
-                goto error;
-        }
-        v9fs_stat2inode_dotl(st, ret);
-        ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-        v9fs_vcookie_set_qid(ret, &st->qid);
-        v9fs_cache_inode_get_cookie(ret);
-#endif
-        err = v9fs_get_acl(ret, fid);
-        if (err) {
-                iput(ret);
-                goto error;
-        }
-        kfree(st);
-        return ret;
-error:
-        kfree(st);
-        return ERR_PTR(err);
-}
-/**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-                        struct super_block *sb)
-{
-        if (v9fs_proto_dotl(v9ses))
-                return v9fs_inode_dotl(v9ses, fid, sb);
-        else
-                return v9fs_inode(v9ses, fid, sb);
-}
 /**
 * v9fs_remove - helper function to remove files and directories
 * @dir: directory inode that is being deleted
@@ -626,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
                goto error;
        }
-        if (v9ses->cache)
-                dentry->d_op = &v9fs_cached_dentry_operations;
-        else
-                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        err = v9fs_fid_add(dentry, fid);
        if (err < 0)
@@ -650,144 +555,6 @@ error:
 }
 /**
- * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
- * @dir: directory inode that is being created
- * @dentry:  dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
- *
- */
-static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                struct nameidata *nd)
-{
-        int err = 0;
-        char *name = NULL;
-        gid_t gid;
-        int flags;
-        mode_t mode;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL;
-        struct p9_fid *dfid, *ofid;
-        struct file *filp;
-        struct p9_qid qid;
-        struct inode *inode;
-        struct posix_acl *pacl = NULL, *dacl = NULL;
-        v9ses = v9fs_inode2v9ses(dir);
-        if (nd && nd->flags & LOOKUP_OPEN)
-                flags = nd->intent.open.flags - 1;
-        else {
-                /*
-                 * create call without LOOKUP_OPEN is due
-                 * to mknod of regular files. So use mknod
-                 * operation.
-                 */
-                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
-        }
-        name = (char *) dentry->d_name.name;
-        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-                        "mode:0x%x\n", name, flags, omode);
-        dfid = v9fs_fid_lookup(dentry->d_parent);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                return err;
-        }
-        /* clone a fid to use for creation */
-        ofid = p9_client_walk(dfid, 0, NULL, 1);
-        if (IS_ERR(ofid)) {
-                err = PTR_ERR(ofid);
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-                return err;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in creat %d\n", err);
-                goto error;
-        }
-        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                                "p9_client_open_dotl failed in creat %d\n",
-                                err);
-                goto error;
-        }
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
-            (nd && nd->flags & LOOKUP_OPEN)) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                /* The fid would get clunked via a dput */
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate
-                 * inode with stat. We need to get an inode
-                 * so that we can set the acl with dentry
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-        /* if we are opening a file, assign the open fid to the file */
-        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-                if (IS_ERR(filp)) {
-                        p9_client_clunk(ofid);
-                        return PTR_ERR(filp);
-                }
-                filp->private_data = ofid;
-        } else
-                p9_client_clunk(ofid);
-        return 0;
-error:
-        if (ofid)
-                p9_client_clunk(ofid);
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-/**
 * v9fs_vfs_create - VFS hook to create files
 * @dir: directory inode that is being created
 * @dentry:  dentry that is being deleted
@@ -877,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return err;
 }
-/**
- * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
- * @dir:  inode that is being unlinked
- * @dentry: dentry that is being unlinked
- * @mode: mode for new directory
- *
- */
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-                               struct dentry *dentry, int omode)
-{
-        int err;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL, *dfid = NULL;
-        gid_t gid;
-        char *name;
-        mode_t mode;
-        struct inode *inode;
-        struct p9_qid qid;
-        struct dentry *dir_dentry;
-        struct posix_acl *dacl = NULL, *pacl = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
-        err = 0;
-        v9ses = v9fs_inode2v9ses(dir);
-        omode |= S_IFDIR;
-        if (dir->i_mode & S_ISGID)
-                omode |= S_ISGID;
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                dfid = NULL;
-                goto error;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in mkdir %d\n", err);
-                goto error;
-        }
-        name = (char *) dentry->d_name.name;
-        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
-        if (err < 0)
-                goto error;
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate
-                 * inode with stat. We need to get an inode
-                 * so that we can set the acl with dentry
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
 /**
 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
 * @dir:  inode that is being walked from
@@ -986,7 +652,7 @@ error:
 *
 */
-static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nameidata)
 {
        struct super_block *sb;
@@ -1033,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
                goto error_iput;
 inst_out:
-        if (v9ses->cache)
-                dentry->d_op = &v9fs_cached_dentry_operations;
-        else
-                dentry->d_op = &v9fs_dentry_operations;
        d_add(dentry, inode);
        return NULL;
@@ -1056,7 +717,7 @@ error:
 *
 */
-static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 {
        return v9fs_remove(i, d, 0);
 }
@@ -1068,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 *
 */
-static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
        return v9fs_remove(i, d, 1);
 }
@@ -1082,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 *
 */
-static int
+int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -1189,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
-static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-                 struct kstat *stat)
-{
-        int err;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid;
-        struct p9_stat_dotl *st;
-        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-        err = -EPERM;
-        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-                return simple_getattr(mnt, dentry, stat);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        /* Ask for all the fields in stat structure. Server will return
-         * whatever it supports
-         */
-        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-        if (IS_ERR(st))
-                return PTR_ERR(st);
-        v9fs_stat2inode_dotl(st, dentry->d_inode);
-        generic_fillattr(dentry->d_inode, stat);
-        /* Change block size to what the server returned */
-        stat->blksize = st->st_blksize;
-        kfree(st);
-        return 0;
-}
 /**
 * v9fs_vfs_setattr - set file metadata
 * @dentry: file whose metadata to set
@@ -1284,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 /**
- * v9fs_vfs_setattr_dotl - set file metadata
- * @dentry: file whose metadata to set
- * @iattr: metadata assignment structure
- *
- */
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
-{
-        int retval;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid;
-        struct p9_iattr_dotl p9attr;
-        P9_DPRINTK(P9_DEBUG_VFS, "\n");
-        retval = inode_change_ok(dentry->d_inode, iattr);
-        if (retval)
-                return retval;
-        p9attr.valid = iattr->ia_valid;
-        p9attr.mode = iattr->ia_mode;
-        p9attr.uid = iattr->ia_uid;
-        p9attr.gid = iattr->ia_gid;
-        p9attr.size = iattr->ia_size;
-        p9attr.atime_sec = iattr->ia_atime.tv_sec;
-        p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
-        p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
-        p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
-        retval = -EPERM;
-        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        retval = p9_client_setattr(fid, &p9attr);
-        if (retval < 0)
-                return retval;
-        if ((iattr->ia_valid & ATTR_SIZE) &&
-            iattr->ia_size != i_size_read(dentry->d_inode)) {
-                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-                if (retval)
-                        return retval;
-        }
-        setattr_copy(dentry->d_inode, iattr);
-        mark_inode_dirty(dentry->d_inode);
-        if (iattr->ia_valid & ATTR_MODE) {
-                /* We also want to update ACL when we update mode bits */
-                retval = v9fs_acl_chmod(dentry);
-                if (retval < 0)
-                        return retval;
-        }
-        return 0;
-}
-/**
 * v9fs_stat2inode - populate an inode structure with mistat info
 * @stat: Plan 9 metadata (mistat) structure
 * @inode: inode to populate
@@ -1419,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 /**
- * v9fs_stat2inode_dotl - populate an inode structure with stat info
- * @stat: stat structure
- * @inode: inode to populate
- * @sb: superblock of filesystem
- *
- */
-void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
-{
-        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-                inode->i_atime.tv_sec = stat->st_atime_sec;
-                inode->i_atime.tv_nsec = stat->st_atime_nsec;
-                inode->i_mtime.tv_sec = stat->st_mtime_sec;
-                inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-                inode->i_ctime.tv_sec = stat->st_ctime_sec;
-                inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-                inode->i_uid = stat->st_uid;
-                inode->i_gid = stat->st_gid;
-                inode->i_nlink = stat->st_nlink;
-                inode->i_mode = stat->st_mode;
-                inode->i_rdev = new_decode_dev(stat->st_rdev);
-                if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-                        init_special_inode(inode, inode->i_mode, inode->i_rdev);
-                i_size_write(inode, stat->st_size);
-                inode->i_blocks = stat->st_blocks;
-        } else {
-                if (stat->st_result_mask & P9_STATS_ATIME) {
-                        inode->i_atime.tv_sec = stat->st_atime_sec;
-                        inode->i_atime.tv_nsec = stat->st_atime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_MTIME) {
-                        inode->i_mtime.tv_sec = stat->st_mtime_sec;
-                        inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_CTIME) {
-                        inode->i_ctime.tv_sec = stat->st_ctime_sec;
-                        inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-                }
-                if (stat->st_result_mask & P9_STATS_UID)
-                        inode->i_uid = stat->st_uid;
-                if (stat->st_result_mask & P9_STATS_GID)
-                        inode->i_gid = stat->st_gid;
-                if (stat->st_result_mask & P9_STATS_NLINK)
-                        inode->i_nlink = stat->st_nlink;
-                if (stat->st_result_mask & P9_STATS_MODE) {
-                        inode->i_mode = stat->st_mode;
-                        if ((S_ISBLK(inode->i_mode)) ||
-                                                (S_ISCHR(inode->i_mode)))
-                                init_special_inode(inode, inode->i_mode,
-                                                                inode->i_rdev);
-                }
-                if (stat->st_result_mask & P9_STATS_RDEV)
-                        inode->i_rdev = new_decode_dev(stat->st_rdev);
-                if (stat->st_result_mask & P9_STATS_SIZE)
-                        i_size_write(inode, stat->st_size);
-                if (stat->st_result_mask & P9_STATS_BLOCKS)
-                        inode->i_blocks = stat->st_blocks;
-        }
-        if (stat->st_result_mask & P9_STATS_GEN)
-                        inode->i_generation = stat->st_gen;
-        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
-         * because the inode structure does not have fields for them.
-         */
-}
-/**
 * v9fs_qid2ino - convert qid into inode number
 * @qid: qid to hash
 *
@@ -1595,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 *
 */
-static void
+void
 v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
        char *s = nd_get_link(nd);
@@ -1639,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 /**
- * v9fs_vfs_symlink_dotl - helper function to create symlinks
- * @dir: directory inode containing symlink
- * @dentry: dentry for symlink
- * @symname: symlink data
- *
- * See Also: 9P2000.L RFC for more information
- *
- */
-static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-                const char *symname)
-{
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *dfid;
-        struct p9_fid *fid = NULL;
-        struct inode *inode;
-        struct p9_qid qid;
-        char *name;
-        int err;
-        gid_t gid;
-        name = (char *) dentry->d_name.name;
-        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
-                        dir->i_ino, name, symname);
-        v9ses = v9fs_inode2v9ses(dir);
-        dfid = v9fs_fid_lookup(dentry->d_parent);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                return err;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
-        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
-                goto error;
-        }
-        if (v9ses->cache) {
-                /* Now walk from the parent so we can get an unopened fid. */
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                        err);
-                        fid = NULL;
-                        goto error;
-                }
-                /* instantiate inode and assign the unopened fid to dentry */
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                        err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /* Not in cached mode. No need to populate inode with stat */
-                inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-/**
 * v9fs_vfs_symlink - helper function to create symlinks
 * @dir: directory inode containing symlink
 * @dentry: dentry for symlink
@@ -1785,77 +1193,6 @@ clunk_fid:
 }
 /**
- * v9fs_vfs_link_dotl - create a hardlink for dotl
- * @old_dentry: dentry for file to link to
- * @dir: inode destination for new link
- * @dentry: dentry for link
- *
- */
-static int
-v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
-                struct dentry *dentry)
-{
-        int err;
-        struct p9_fid *dfid, *oldfid;
-        char *name;
-        struct v9fs_session_info *v9ses;
-        struct dentry *dir_dentry;
-        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-                        dir->i_ino, old_dentry->d_name.name,
-                        dentry->d_name.name);
-        v9ses = v9fs_inode2v9ses(dir);
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid))
-                return PTR_ERR(dfid);
-        oldfid = v9fs_fid_lookup(old_dentry);
-        if (IS_ERR(oldfid))
-                return PTR_ERR(oldfid);
-        name = (char *) dentry->d_name.name;
-        err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
-        if (err < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
-                return err;
-        }
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                /* Get the latest stat info from server. */
-                struct p9_fid *fid;
-                struct p9_stat_dotl *st;
-                fid = v9fs_fid_lookup(old_dentry);
-                if (IS_ERR(fid))
-                        return PTR_ERR(fid);
-                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-                if (IS_ERR(st))
-                        return PTR_ERR(st);
-                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-                kfree(st);
-        } else {
-                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just hold the
-                 * inode
-                 */
-                ihold(old_dentry->d_inode);
-        }
-        dentry->d_op = old_dentry->d_op;
-        d_instantiate(dentry, old_dentry->d_inode);
-        return err;
-}
-/**
 * v9fs_vfs_mknod - create a special file
 * @dir: inode destination for new link
 * @dentry: dentry for file
@@ -1900,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
-/**
- * v9fs_vfs_mknod_dotl - create a special file
- * @dir: inode destination for new link
- * @dentry: dentry for file
- * @mode: mode for creation
- * @rdev: device associated with special file
- *
- */
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-                dev_t rdev)
-{
-        int err;
-        char *name;
-        mode_t mode;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid = NULL, *dfid = NULL;
-        struct inode *inode;
-        gid_t gid;
-        struct p9_qid qid;
-        struct dentry *dir_dentry;
-        struct posix_acl *dacl = NULL, *pacl = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS,
-                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
-        if (!new_valid_dev(rdev))
-                return -EINVAL;
-        v9ses = v9fs_inode2v9ses(dir);
-        dir_dentry = v9fs_dentry_from_dir_inode(dir);
-        dfid = v9fs_fid_lookup(dir_dentry);
-        if (IS_ERR(dfid)) {
-                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                dfid = NULL;
-                goto error;
-        }
-        gid = v9fs_get_fsgid_for_create(dir);
-        mode = omode;
-        /* Update mode based on ACL value */
-        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-        if (err) {
-                P9_DPRINTK(P9_DEBUG_VFS,
-                           "Failed to get acl values in mknod %d\n", err);
-                goto error;
-        }
-        name = (char *) dentry->d_name.name;
-        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
-        if (err < 0)
-                goto error;
-        /* instantiate inode and assign the unopened fid to the dentry */
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                fid = p9_client_walk(dfid, 1, &name, 1);
-                if (IS_ERR(fid)) {
-                        err = PTR_ERR(fid);
-                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-                                err);
-                        fid = NULL;
-                        goto error;
-                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                                err);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_cached_dentry_operations;
-                d_instantiate(dentry, inode);
-                err = v9fs_fid_add(dentry, fid);
-                if (err < 0)
-                        goto error;
-                fid = NULL;
-        } else {
-                /*
-                 * Not in cached mode. No need to populate inode with stat.
-                 * socket syscall returns a fd, so we need instantiate
-                 */
-                inode = v9fs_get_inode(dir->i_sb, mode);
-                if (IS_ERR(inode)) {
-                        err = PTR_ERR(inode);
-                        goto error;
-                }
-                dentry->d_op = &v9fs_dentry_operations;
-                d_instantiate(dentry, inode);
-        }
-        /* Now set the ACL based on the default value */
-        v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-        if (fid)
-                p9_client_clunk(fid);
-        return err;
-}
-static int
-v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
-{
-        int retval;
-        struct p9_fid *fid;
-        char *target = NULL;
-        P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
-        retval = -EPERM;
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
-        retval = p9_client_readlink(fid, &target);
-        if (retval < 0)
-                return retval;
-        strncpy(buffer, target, buflen);
-        P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
-        retval = strnlen(buffer, buflen);
-        return retval;
-}
-/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
-{
-        int len = 0;
-        char *link = __getname();
-        P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
-        if (!link)
-                link = ERR_PTR(-ENOMEM);
-        else {
-                len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
-                if (len < 0) {
-                        __putname(link);
-                        link = ERR_PTR(len);
-                } else
-                        link[min(len, PATH_MAX-1)] = 0;
-        }
-        nd_set_link(nd, link);
-        return NULL;
-}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -2068,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-        .create = v9fs_vfs_create_dotl,
-        .lookup = v9fs_vfs_lookup,
-        .link = v9fs_vfs_link_dotl,
-        .symlink = v9fs_vfs_symlink_dotl,
-        .unlink = v9fs_vfs_unlink,
-        .mkdir = v9fs_vfs_mkdir_dotl,
-        .rmdir = v9fs_vfs_rmdir,
-        .mknod = v9fs_vfs_mknod_dotl,
-        .rename = v9fs_vfs_rename,
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-        .check_acl = v9fs_check_acl,
-};
 static const struct inode_operations v9fs_dir_inode_operations = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -2104,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_file_inode_operations_dotl = {
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-        .check_acl = v9fs_check_acl,
-};
 static const struct inode_operations v9fs_symlink_inode_operations = {
        .readlink = generic_readlink,
        .follow_link = v9fs_vfs_follow_link,
@@ -2122,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
        .setattr = v9fs_vfs_setattr,
 };
-static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-        .readlink = v9fs_vfs_readlink_dotl,
-        .follow_link = v9fs_vfs_follow_link_dotl,
-        .put_link = v9fs_vfs_put_link,
-        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr_dotl,
-        .setxattr = generic_setxattr,
-        .getxattr = generic_getxattr,
-        .removexattr = generic_removexattr,
-        .listxattr = v9fs_listxattr,
-};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 00000000000..fe3ffa9aace
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
+/*
+ *  linux/fs/9p/vfs_inode_dotl.c
+ *
+ * This file contains vfs inode ops for the 9P2000.L protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+#include "cache.h"
+#include "xattr.h"
+#include "acl.h"
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                    dev_t rdev);
+/**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+        BUG_ON(dir_inode == NULL);
+        if (dir_inode->i_mode & S_ISGID) {
+                /* set_gid bit is set.*/
+                return dir_inode->i_gid;
+        }
+        return current_fsgid();
+}
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+        struct dentry *dentry;
+        spin_lock(&inode->i_lock);
+        /* Directory should have only one entry. */
+        BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+        dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+        spin_unlock(&inode->i_lock);
+        return dentry;
+}
+struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+        struct super_block *sb)
+{
+        struct inode *ret = NULL;
+        int err;
+        struct p9_stat_dotl *st;
+        st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+        if (IS_ERR(st))
+                return ERR_CAST(st);
+        ret = v9fs_get_inode(sb, st->st_mode);
+        if (IS_ERR(ret)) {
+                err = PTR_ERR(ret);
+                goto error;
+        }
+        v9fs_stat2inode_dotl(st, ret);
+        ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+        v9fs_vcookie_set_qid(ret, &st->qid);
+        v9fs_cache_inode_get_cookie(ret);
+#endif
+        err = v9fs_get_acl(ret, fid);
+        if (err) {
+                iput(ret);
+                goto error;
+        }
+        kfree(st);
+        return ret;
+error:
+        kfree(st);
+        return ERR_PTR(err);
+}
+/**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: directory inode in which the file is being created
+ * @dentry:  dentry of the file that is being created
+ * @omode: create permissions
+ * @nd: path information
+ *
+ */
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                struct nameidata *nd)
+{
+        int err = 0;
+        char *name = NULL;
+        gid_t gid;
+        int flags;
+        mode_t mode;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL;
+        struct p9_fid *dfid, *ofid;
+        struct file *filp;
+        struct p9_qid qid;
+        struct inode *inode;
+        struct posix_acl *pacl = NULL, *dacl = NULL;
+        v9ses = v9fs_inode2v9ses(dir);
+        if (nd && nd->flags & LOOKUP_OPEN)
+                flags = nd->intent.open.flags - 1;
+        else {
+                /*
+                 * create call without LOOKUP_OPEN is due
+                 * to mknod of regular files. So use mknod
+                 * operation.
+                 */
+                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+        }
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+                        "mode:0x%x\n", name, flags, omode);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        /* clone a fid to use for creation */
+        ofid = p9_client_walk(dfid, 0, NULL, 1);
+        if (IS_ERR(ofid)) {
+                err = PTR_ERR(ofid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in creat %d\n", err);
+                goto error;
+        }
+        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                                "p9_client_open_dotl failed in creat %d\n",
+                                err);
+                goto error;
+        }
+        /* instantiate inode and assign the unopened fid to the dentry */
+        fid = p9_client_walk(dfid, 1, &name, 1);
+        if (IS_ERR(fid)) {
+                err = PTR_ERR(fid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                fid = NULL;
+                goto error;
+        }
+        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+                goto error;
+        }
+        d_instantiate(dentry, inode);
+        err = v9fs_fid_add(dentry, fid);
+        if (err < 0)
+                goto error;
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+        /* Since we are opening a file, assign the open fid to the file */
+        filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
+        if (IS_ERR(filp)) {
+                p9_client_clunk(ofid);
+                return PTR_ERR(filp);
+        }
+        filp->private_data = ofid;
+        return 0;
+error:
+        if (ofid)
+                p9_client_clunk(ofid);
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir:  inode of the parent directory
+ * @dentry: dentry of the directory that is being created
+ * @omode: mode for new directory
+ *
+ */
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+                               struct dentry *dentry, int omode)
+{
+        int err;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        gid_t gid;
+        char *name;
+        mode_t mode;
+        struct inode *inode;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+        err = 0;
+        v9ses = v9fs_inode2v9ses(dir);
+        omode |= S_IFDIR;
+        if (dir->i_mode & S_ISGID)
+                omode |= S_ISGID;
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                dfid = NULL;
+                goto error;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mkdir %d\n", err);
+                goto error;
+        }
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        int err;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_stat_dotl *st;
+        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+        err = -EPERM;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+                return simple_getattr(mnt, dentry, stat);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        /* Ask for all the fields in stat structure. Server will return
+         * whatever it supports
+         */
+        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+        if (IS_ERR(st))
+                return PTR_ERR(st);
+        v9fs_stat2inode_dotl(st, dentry->d_inode);
+        generic_fillattr(dentry->d_inode, stat);
+        /* Change block size to what the server returned */
+        stat->blksize = st->st_blksize;
+        kfree(st);
+        return 0;
+}
+/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+        int retval;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_iattr_dotl p9attr;
+        P9_DPRINTK(P9_DEBUG_VFS, "\n");
+        retval = inode_change_ok(dentry->d_inode, iattr);
+        if (retval)
+                return retval;
+        p9attr.valid = iattr->ia_valid;
+        p9attr.mode = iattr->ia_mode;
+        p9attr.uid = iattr->ia_uid;
+        p9attr.gid = iattr->ia_gid;
+        p9attr.size = iattr->ia_size;
+        p9attr.atime_sec = iattr->ia_atime.tv_sec;
+        p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+        p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+        p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+        retval = -EPERM;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        retval = p9_client_setattr(fid, &p9attr);
+        if (retval < 0)
+                return retval;
+        if ((iattr->ia_valid & ATTR_SIZE) &&
+            iattr->ia_size != i_size_read(dentry->d_inode)) {
+                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+                if (retval)
+                        return retval;
+        }
+        setattr_copy(dentry->d_inode, iattr);
+        mark_inode_dirty(dentry->d_inode);
+        if (iattr->ia_valid & ATTR_MODE) {
+                /* We also want to update ACL when we update mode bits */
+                retval = v9fs_acl_chmod(dentry);
+                if (retval < 0)
+                        return retval;
+        }
+        return 0;
+}
+/**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ *
+ * Only the fields flagged in @stat->st_result_mask are copied into @inode.
+ */
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+                inode->i_atime.tv_sec = stat->st_atime_sec;
+                inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                inode->i_uid = stat->st_uid;
+                inode->i_gid = stat->st_gid;
+                inode->i_nlink = stat->st_nlink;
+                inode->i_mode = stat->st_mode;
+                inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+                        init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                i_size_write(inode, stat->st_size);
+                inode->i_blocks = stat->st_blocks;
+        } else {
+                if (stat->st_result_mask & P9_STATS_ATIME) {
+                        inode->i_atime.tv_sec = stat->st_atime_sec;
+                        inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_MTIME) {
+                        inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                        inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_CTIME) {
+                        inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                        inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_UID)
+                        inode->i_uid = stat->st_uid;
+                if (stat->st_result_mask & P9_STATS_GID)
+                        inode->i_gid = stat->st_gid;
+                if (stat->st_result_mask & P9_STATS_NLINK)
+                        inode->i_nlink = stat->st_nlink;
+                if (stat->st_result_mask & P9_STATS_MODE) {
+                        inode->i_mode = stat->st_mode;
+                        if ((S_ISBLK(inode->i_mode)) ||
+                                                (S_ISCHR(inode->i_mode)))
+                                init_special_inode(inode, inode->i_mode,
+                                                                inode->i_rdev);
+                }
+                if (stat->st_result_mask & P9_STATS_RDEV)
+                        inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if (stat->st_result_mask & P9_STATS_SIZE)
+                        i_size_write(inode, stat->st_size);
+                if (stat->st_result_mask & P9_STATS_BLOCKS)
+                        inode->i_blocks = stat->st_blocks;
+        }
+        if (stat->st_result_mask & P9_STATS_GEN)
+                        inode->i_generation = stat->st_gen;
+        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+         * because the inode structure does not have fields for them.
+         */
+}
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+                const char *symname)
+{
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *dfid;
+        struct p9_fid *fid = NULL;
+        struct inode *inode;
+        struct p9_qid qid;
+        char *name;
+        int err;
+        gid_t gid;
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+                        dir->i_ino, name, symname);
+        v9ses = v9fs_inode2v9ses(dir);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+                goto error;
+        }
+        if (v9ses->cache) {
+                /* Now walk from the parent so we can get an unopened fid. */
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                        err);
+                        fid = NULL;
+                        goto error;
+                }
+                /* instantiate inode and assign the unopened fid to dentry */
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                        err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /* Not in cached mode. No need to populate inode with stat */
+                inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+                struct dentry *dentry)
+{
+        int err;
+        struct p9_fid *dfid, *oldfid;
+        char *name;
+        struct v9fs_session_info *v9ses;
+        struct dentry *dir_dentry;
+        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+                        dir->i_ino, old_dentry->d_name.name,
+                        dentry->d_name.name);
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid))
+                return PTR_ERR(dfid);
+        oldfid = v9fs_fid_lookup(old_dentry);
+        if (IS_ERR(oldfid))
+                return PTR_ERR(oldfid);
+        name = (char *) dentry->d_name.name;
+        err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+                return err;
+        }
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                /* Get the latest stat info from server. */
+                struct p9_fid *fid;
+                struct p9_stat_dotl *st;
+                fid = v9fs_fid_lookup(old_dentry);
+                if (IS_ERR(fid))
+                        return PTR_ERR(fid);
+                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+                if (IS_ERR(st))
+                        return PTR_ERR(st);
+                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+                kfree(st);
+        } else {
+                /* Caching disabled. No need to get up-to-date stat info.
+                 * This dentry will be released immediately. So, just hold the
+                 * inode
+                 */
+                ihold(old_dentry->d_inode);
+        }
+        d_instantiate(dentry, old_dentry->d_inode);
+        return err;
+}
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode of the directory in which the special file is created
+ * @dentry: dentry for file
+ * @omode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                dev_t rdev)
+{
+        int err;
+        char *name;
+        mode_t mode;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        struct inode *inode;
+        gid_t gid;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS,
+                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                dfid = NULL;
+                goto error;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mknod %d\n", err);
+                goto error;
+        }
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate inode with stat.
+                 * socket syscall returns a fd, so we need instantiate
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                d_instantiate(dentry, inode);
+        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ * Reads the link target from the server with p9_client_readlink() and
+ * copies it into a buffer obtained from __getname(), which is then handed
+ * to nd_set_link().  On any failure the name buffer is released here and
+ * an ERR_PTR() value is handed to nd_set_link() instead.
+ *
+ * The target string comes from the server, so its length is checked
+ * against the PATH_MAX-sized name buffer before copying; a target that
+ * does not fit is rejected with -ENAMETOOLONG.
+ *
+ * Always returns NULL; the outcome is reported through nd_set_link().
+ */
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+        int retval;
+        size_t len;
+        struct p9_fid *fid;
+        char *link = __getname();
+        char *target;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+        if (!link) {
+                link = ERR_PTR(-ENOMEM);
+                goto ndset;
+        }
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid)) {
+                __putname(link);
+                link = ERR_CAST(fid);
+                goto ndset;
+        }
+        retval = p9_client_readlink(fid, &target);
+        if (retval) {
+                __putname(link);
+                link = ERR_PTR(retval);
+                goto ndset;
+        }
+        /* Never copy more than the name buffer can hold, NUL included. */
+        len = strlen(target);
+        if (len >= PATH_MAX) {
+                kfree(target);
+                __putname(link);
+                link = ERR_PTR(-ENAMETOOLONG);
+                goto ndset;
+        }
+        memcpy(link, target, len + 1);
+        kfree(target);
+ndset:
+        nd_set_link(nd, link);
+        return NULL;
+}
+const struct inode_operations v9fs_dir_inode_operations_dotl = { /* inode ops table for directories, "_dotl" variant */
+        .create = v9fs_vfs_create_dotl,
+        .lookup = v9fs_vfs_lookup, /* no _dotl suffix: presumably shared with the non-dotl table - confirm */
+        .link = v9fs_vfs_link_dotl,
+        .symlink = v9fs_vfs_symlink_dotl,
+        .unlink = v9fs_vfs_unlink, /* no _dotl suffix, as with .lookup */
+        .mkdir = v9fs_vfs_mkdir_dotl,
+        .rmdir = v9fs_vfs_rmdir, /* no _dotl suffix, as with .lookup */
+        .mknod = v9fs_vfs_mknod_dotl,
+        .rename = v9fs_vfs_rename, /* no _dotl suffix, as with .lookup */
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
+};
+const struct inode_operations v9fs_file_inode_operations_dotl = { /* inode ops table for files: attribute, xattr and ACL hooks only */
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
+};
+const struct inode_operations v9fs_symlink_inode_operations_dotl = { /* inode ops table for symlinks */
+        .readlink = generic_readlink,
+        .follow_link = v9fs_vfs_follow_link_dotl,
+        .put_link = v9fs_vfs_put_link, /* presumably releases the buffer follow_link passed to nd_set_link() - confirm */
+        .getattr = v9fs_vfs_getattr_dotl,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr, /* unlike the dir and file tables, no .check_acl hook is set */
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c55c614500a..dbaabe3b813 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
        }
        v9fs_fill_super(sb, v9ses, flags, data);
+        if (v9ses->cache)
+                sb->s_d_op = &v9fs_cached_dentry_operations;
+        else
+                sb->s_d_op = &v9fs_dentry_operations;
        inode = v9fs_get_inode(sb, S_IFDIR | mode);
        if (IS_ERR(inode)) {
                retval = PTR_ERR(inode);
@@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
        P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
-        if (s->s_root)
-                v9fs_dentry_release(s->s_root); /* clunk root */
        kill_anon_super(s);
        v9fs_session_cancel(v9ses);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df8433..d288773871b 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
                        "p9_client_xattrcreate failed %d\n", retval);
                goto error;
        }
-        msize = fid->clnt->msize;;
+        msize = fid->clnt->msize;
        while (value_len) {
                if (value_len > (msize - P9_IOHDRSZ))
                        write_count = msize - P9_IOHDRSZ;
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d..3db9caa57ed 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
-config FS_POSIX_ACL
-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
-#
-# NOTE: you can implement Posix ACLs without these helpers (XFS does).
-#       Never use this symbol for ifdefs.
-#
-        bool
-        default n
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
 endif # BLOCK
+# Posix ACL utility routines
+#
+# Note: Posix ACLs can be implemented without these helpers.  Never use
+# this symbol for ifdefs in core code.
+#
+config FS_POSIX_ACL
+        def_bool n
 config EXPORTFS
        tristate
 config FILE_LOCKING
-        bool "Enable POSIX file locking API" if EMBEDDED
+        bool "Enable POSIX file locking API" if EXPERT
        default y
        help
          This option enables standard file locking support, required
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de74..3b4a764ed78 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
 };
 static int
-adfs_hash(struct dentry *parent, struct qstr *qstr)
+adfs_hash(const struct dentry *parent, const struct inode *inode,
+                struct qstr *qstr)
 {
        const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
        const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
 * requirements of the underlying filesystem.
 */
 static int
-adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
+adfs_compare(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        int i;
-        if (entry->len != name->len)
+        if (len != name->len)
                return 1;
        for (i = 0; i < name->len; i++) {
                char a, b;
-                a = entry->name[i];
+                a = str[i];
                b = name->name[i];
                if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        struct object_info obj;
        int error;
-        dentry->d_op = &adfs_dentry_operations; 
        lock_kernel();
        error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
        if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 959dbff2d42..2d7954049fb 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void adfs_destroy_inode(struct inode *inode)
+static void adfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
+static void adfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, adfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -466,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
                asb->s_namelen = ADFS_F_NAME_LEN;
        }
+        sb->s_d_op = &adfs_dentry_operations;
        root = adfs_iget(sb, &root_obj);
        sb->s_root = d_alloc_root(root);
        if (!sb->s_root) {
@@ -476,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
                kfree(asb->s_map);
                adfs_error(sb, "get root inode failed\n");
                goto error;
-        } else
+        }
-                sb->s_root->d_op = &adfs_dentry_operations;
        unlock_kernel();
        return 0;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb3402..0e95f73a702 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations	 affs_aops;
 extern const struct address_space_operations     affs_aops_ofs;
 extern const struct dentry_operations    affs_dentry_operations;
+extern const struct dentry_operations    affs_intl_dentry_operations;
 static inline void
 affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a..3a4557e8325 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
        void *data = dentry->d_fsdata;
        struct list_head *head, *next;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        head = &inode->i_dentry;
        next = head->next;
        while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
                }
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07..e3e9efc1fdd 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,26 @@
 typedef int (*toupper_t)(int);
 static int       affs_toupper(int ch);
-static int       affs_hash_dentry(struct dentry *, struct qstr *);
+static int       affs_hash_dentry(const struct dentry *,
-static int       affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+                const struct inode *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 static int       affs_intl_toupper(int ch);
-static int       affs_intl_hash_dentry(struct dentry *, struct qstr *);
+static int       affs_intl_hash_dentry(const struct dentry *,
-static int       affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+                const struct inode *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 const struct dentry_operations affs_dentry_operations = {
        .d_hash         = affs_hash_dentry,
        .d_compare      = affs_compare_dentry,
 };
-static const struct dentry_operations affs_intl_dentry_operations = {
+const struct dentry_operations affs_intl_dentry_operations = {
        .d_hash         = affs_intl_hash_dentry,
        .d_compare      = affs_intl_compare_dentry,
 };
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
 * Note: the dentry argument is the parent dentry.
 */
 static inline int
-__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 {
        const u8 *name = qstr->name;
        unsigned long hash;
        int i;
-        i = affs_check_name(qstr->name,qstr->len);
+        i = affs_check_name(qstr->name, qstr->len);
        if (i)
                return i;
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
 }
 static int
-affs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        return __affs_hash_dentry(dentry, qstr, affs_toupper);
+        return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        return __affs_hash_dentry(dentry, qstr, affs_intl_toupper);
+        return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
-static inline int
+static inline int __affs_compare_dentry(unsigned int len,
-__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper)
+                const char *str, const struct qstr *name, toupper_t toupper)
 {
-        const u8 *aname = a->name;
+        const u8 *aname = str;
-        const u8 *bname = b->name;
+        const u8 *bname = name->name;
-        int len;
-        /* 'a' is the qstr of an already existing dentry, so the name
+        /*
-         * must be valid. 'b' must be validated first.
+         * 'str' is the name of an already existing dentry, so the name
+         * must be valid. 'name' must be validated first.
         */
-        if (affs_check_name(b->name,b->len))
+        if (affs_check_name(name->name, name->len))
                return 1;
-        /* If the names are longer than the allowed 30 chars,
+        /*
+         * If the names are longer than the allowed 30 chars,
         * the excess is ignored, so their length may differ.
         */
-        len = a->len;
        if (len >= 30) {
-                if (b->len < 30)
+                if (name->len < 30)
                        return 1;
                len = 30;
-        } else if (len != b->len)
+        } else if (len != name->len)
                return 1;
        for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
 }
 static int
-affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return __affs_compare_dentry(dentry, a, b, affs_toupper);
+        return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return __affs_compare_dentry(dentry, a, b, affs_intl_toupper);
+        return __affs_compare_dentry(len, str, name, affs_intl_toupper);
 }
 /*
@@ -226,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
                if (IS_ERR(inode))
                        return ERR_CAST(inode);
        }
-        dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
        d_add(dentry, inode);
        return NULL;
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0cf7f4384cb..b31507d0f9b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
        return &i->vfs_inode;
 }
-static void affs_destroy_inode(struct inode *inode)
+static void affs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
+static void affs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, affs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -470,12 +477,16 @@ got_root:
                goto out_error_noinode;
        }
+        if (AFFS_SB(sb)->s_flags & SF_INTL)
+                sb->s_d_op = &affs_intl_dentry_operations;
+        else
+                sb->s_d_op = &affs_dentry_operations;
        sb->s_root = d_alloc_root(root_inode);
        if (!sb->s_root) {
                printk(KERN_ERR "AFFS: Get root inode failed\n");
                goto out_error;
        }
-        sb->s_root->d_op = &affs_dentry_operations;
        pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
        return 0;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54..1c8c6cc6de3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
        call->server = server;
        INIT_WORK(&call->work, SRXAFSCB_CallBack);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
        call->server = server;
        INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
        call->server = server;
        INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
        call->state = AFS_CALL_REPLYING;
        INIT_WORK(&call->work, SRXAFSCB_Probe);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
        call->state = AFS_CALL_REPLYING;
        INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
        call->state = AFS_CALL_REPLYING;
        INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
-        schedule_work(&call->work);
+        queue_work(afs_wq, &call->work);
        return 0;
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5439e1bc9a8..20c106f2492 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 static int afs_dir_open(struct inode *inode, struct file *file);
 static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
-static int afs_d_delete(struct dentry *dentry);
+static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
 static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
                                  loff_t fpos, u64 ino, unsigned dtype);
@@ -61,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
        .setattr        = afs_setattr,
 };
-static const struct dentry_operations afs_fs_dentry_operations = {
+const struct dentry_operations afs_fs_dentry_operations = {
        .d_revalidate   = afs_d_revalidate,
        .d_delete       = afs_d_delete,
        .d_release      = afs_d_release,
+        .d_automount    = afs_d_automount,
 };
 #define AFS_DIR_HASHTBL_SIZE    128
@@ -581,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
        }
 success:
-        dentry->d_op = &afs_fs_dentry_operations;
        d_add(dentry, inode);
        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
               fid.vnode,
@@ -607,6 +607,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
        void *dir_version;
        int ret;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        vnode = AFS_FS_I(dentry->d_inode);
        if (dentry->d_inode)
@@ -730,7 +733,7 @@ out_bad:
 * - called from dput() when d_count is going to 0.
 * - return 1 to request dentry be unhashed, 0 otherwise
 */
-static int afs_d_delete(struct dentry *dentry)
+static int afs_d_delete(const struct dentry *dentry)
 {
        _enter("%s", dentry->d_name.name);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c..db66c520147 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
        inode->i_generation     = 0;
        set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
-        inode->i_flags |= S_NOATIME;
+        set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+        inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
        unlock_new_inode(inode);
        _leave(" = %p", inode);
        return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736f..5a9b6843bac 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
 * dir.c
 */
 extern const struct inode_operations afs_dir_inode_operations;
+extern const struct dentry_operations afs_fs_dentry_operations;
 extern const struct file_operations afs_dir_file_operations;
 /*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
 /*
 * main.c
 */
+extern struct workqueue_struct *afs_wq;
 extern struct afs_uuid afs_uuid;
 /*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
 extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
+extern struct vfsmount *afs_d_automount(struct path *);
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
@@ -624,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct inode *, int, unsigned int);
 /*
 * server.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b2..42dd2e499ed 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 struct afs_uuid afs_uuid;
+struct workqueue_struct *afs_wq;
 /*
 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
        if (ret < 0)
                return ret;
+        /* create workqueue */
+        ret = -ENOMEM;
+        afs_wq = alloc_workqueue("afs", 0, 0);
+        if (!afs_wq)
+                return ret;
        /* register the /proc stuff */
        ret = afs_proc_init();
        if (ret < 0)
-                return ret;
+                goto error_proc;
 #ifdef CONFIG_AFS_FSCACHE
        /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
 error_cache:
 #endif
        afs_proc_cleanup();
+error_proc:
+        destroy_workqueue(afs_wq);
        rcu_barrier();
        printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
        return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
        afs_purge_servers();
        afs_callback_update_kill();
        afs_vlocation_purge();
-        flush_scheduled_work();
+        destroy_workqueue(afs_wq);
        afs_cell_purge();
 #ifdef CONFIG_AFS_FSCACHE
        fscache_unregister_netfs(&afs_cache_netfs);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf5..aa59184151d 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                                       struct dentry *dentry,
                                       struct nameidata *nd);
 static int afs_mntpt_open(struct inode *inode, struct file *file);
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
 static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
 const struct inode_operations afs_mntpt_inode_operations = {
        .lookup         = afs_mntpt_lookup,
-        .follow_link    = afs_mntpt_follow_link,
        .readlink       = page_readlink,
        .getattr        = afs_getattr,
 };
 const struct inode_operations afs_autocell_inode_operations = {
-        .follow_link    = afs_mntpt_follow_link,
        .getattr        = afs_getattr,
 };
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
                _debug("symlink is a mountpoint");
                spin_lock(&vnode->lock);
                set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+                vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
                spin_unlock(&vnode->lock);
        }
@@ -238,52 +236,24 @@ error_no_devname:
 }
 /*
- * follow a link from a mountpoint directory, thus causing it to be mounted
+ * handle an automount point
 */
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *afs_d_automount(struct path *path)
 {
        struct vfsmount *newmnt;
-        int err;
-        _enter("%p{%s},{%s:%p{%s},}",
+        _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
-               dentry,
-               dentry->d_name.name,
-               nd->path.mnt->mnt_devname,
-               dentry,
-               nd->path.dentry->d_name.name);
-        dput(nd->path.dentry);
-        nd->path.dentry = dget(dentry);
-        newmnt = afs_mntpt_do_automount(nd->path.dentry);
+        newmnt = afs_mntpt_do_automount(path->dentry);
-        if (IS_ERR(newmnt)) {
+        if (IS_ERR(newmnt))
-                path_put(&nd->path);
+                return newmnt;
-                return (void *)newmnt;
-        }
-        mntget(newmnt);
-        err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
-        switch (err) {
-        case 0:
-                path_put(&nd->path);
-                nd->path.mnt = newmnt;
-                nd->path.dentry = dget(newmnt->mnt_root);
-                schedule_delayed_work(&afs_mntpt_expiry_timer,
-                                      afs_mntpt_expiry_timeout * HZ);
-                break;
-        case -EBUSY:
-                /* someone else made a mount here whilst we were busy */
-                while (d_mountpoint(nd->path.dentry) &&
-                       follow_down(&nd->path))
-                        ;
-                err = 0;
-        default:
-                mntput(newmnt);
-                break;
-        }
-        _leave(" = %d", err);
+        mntget(newmnt); /* prevent immediate expiration */
-        return ERR_PTR(err);
+        mnt_set_expiry(newmnt, &afs_vfsmounts);
+        queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+                           afs_mntpt_expiry_timeout * HZ);
+        _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+        return newmnt;
 }
 /*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
        if (!list_empty(&afs_vfsmounts)) {
                mark_mounts_for_expiry(&afs_vfsmounts);
-                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
-                                      afs_mntpt_expiry_timeout * HZ);
+                                   afs_mntpt_expiry_timeout * HZ);
        }
        _leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
        _enter("");
        ASSERT(list_empty(&afs_vfsmounts));
-        cancel_delayed_work(&afs_mntpt_expiry_timer);
+        cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
-        flush_scheduled_work();
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01..e45a323aebb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
        if (!call) {
                /* its an incoming call for our callback service */
                skb_queue_tail(&afs_incoming_calls, skb);
-                schedule_work(&afs_collect_incoming_call_work);
+                queue_work(afs_wq, &afs_collect_incoming_call_work);
        } else {
                /* route the messages directly to the appropriate call */
                skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e..f44b9d35537 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 * - AFS ACLs are attached to directories only, and a file is controlled by its
 *   parent directory's ACL
 */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct afs_vnode *vnode = AFS_FS_I(inode);
        afs_access_t uninitialized_var(access);
        struct key *key;
        int ret;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        _enter("{{%x:%u},%lx},%x,",
               vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
        }
        key_put(key);
-        ret = generic_permission(inode, mask, NULL);
+        ret = generic_permission(inode, mask, flags, NULL);
        _leave(" = %d", ret);
        return ret;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7b..d59b7516e94 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
        if (atomic_read(&server->usage) == 0) {
                list_move_tail(&server->grave, &afs_server_graveyard);
                server->time_of_death = get_seconds();
-                schedule_delayed_work(&afs_server_reaper,
+                queue_delayed_work(afs_wq, &afs_server_reaper,
-                                      afs_server_timeout * HZ);
+                                   afs_server_timeout * HZ);
        }
        spin_unlock(&afs_server_graveyard_lock);
        _leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
                expiry = server->time_of_death + afs_server_timeout;
                if (expiry > now) {
                        delay = (expiry - now) * HZ;
-                        if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+                        if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+                                                delay)) {
                                cancel_delayed_work(&afs_server_reaper);
-                                schedule_delayed_work(&afs_server_reaper,
+                                queue_delayed_work(afs_wq, &afs_server_reaper,
-                                                      delay);
+                                                   delay);
                        }
                        break;
                }
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
 {
        afs_server_timeout = 0;
        cancel_delayed_work(&afs_server_reaper);
-        schedule_delayed_work(&afs_server_reaper, 0);
+        queue_delayed_work(afs_wq, &afs_server_reaper, 0);
 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 27201cffece..fb240e8766d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
        if (!root)
                goto error;
+        sb->s_d_op = &afs_fs_dentry_operations;
        sb->s_root = root;
        _leave(" = 0");
@@ -498,6 +499,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
        return &vnode->vfs_inode;
 }
+/*
+ * RCU callback that releases an AFS inode: recover the inode from its
+ * embedded i_rcu head, reset its i_dentry list head and hand the afs_vnode
+ * obtained through AFS_FS_I() back to afs_inode_cachep.
+ */
+static void afs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct afs_vnode *vnode = AFS_FS_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(afs_inode_cachep, vnode);
+}
 /*
 * destroy an AFS inode struct
 */
@@ -511,7 +520,7 @@ static void afs_destroy_inode(struct inode *inode)
        ASSERTCMP(vnode->server, ==, NULL);
-        kmem_cache_free(afs_inode_cachep, vnode);
+        call_rcu(&inode->i_rcu, afs_i_callback);
        atomic_dec(&afs_count_active_inodes);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361..431984d2e37 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
                _debug("buried");
                list_move_tail(&vl->grave, &afs_vlocation_graveyard);
                vl->time_of_death = get_seconds();
-                schedule_delayed_work(&afs_vlocation_reap,
+                queue_delayed_work(afs_wq, &afs_vlocation_reap,
-                                      afs_vlocation_timeout * HZ);
+                                   afs_vlocation_timeout * HZ);
                /* suspend updates on this record */
                if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
                if (expiry > now) {
                        delay = (expiry - now) * HZ;
                        _debug("delay %lu", delay);
-                        if (!schedule_delayed_work(&afs_vlocation_reap,
+                        if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
-                                                   delay)) {
+                                                delay)) {
                                cancel_delayed_work(&afs_vlocation_reap);
-                                schedule_delayed_work(&afs_vlocation_reap,
+                                queue_delayed_work(afs_wq, &afs_vlocation_reap,
-                                                      delay);
+                                                   delay);
                        }
                        break;
                }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
        destroy_workqueue(afs_vlocation_update_worker);
        cancel_delayed_work(&afs_vlocation_reap);
-        schedule_delayed_work(&afs_vlocation_reap, 0);
+        queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
 }
 /*
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d7..fc557a3be0a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
        aio_wq = create_workqueue("aio");
        abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-        BUG_ON(!abe_pool);
+        BUG_ON(!aio_wq || !abe_pool);
        pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
        queue_delayed_work(aio_wq, &ctx->wq, timeout);
 }
-/*
- * aio_run_iocbs:
- *      Process all pending retries queued on the ioctx
- *      run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static inline void aio_run_iocbs(struct kioctx *ctx)
-{
-        int requeue;
-        spin_lock_irq(&ctx->ctx_lock);
-        requeue = __aio_run_iocbs(ctx);
-        spin_unlock_irq(&ctx->ctx_lock);
-        if (requeue)
-                aio_queue_work(ctx);
-}
 /*
- * just like aio_run_iocbs, but keeps running them until
+ * aio_run_all_iocbs:
- * the list stays empty
+ *      Process all pending retries queued on the ioctx
+ *      run list, and keep running them until the list
+ *      stays empty.
+ * Assumes it is operating within the aio issuer's mm context.
 */
 static inline void aio_run_all_iocbs(struct kioctx *ctx)
 {
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
        long ret = -EINVAL;
        if (likely(ioctx)) {
-                if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
+                if (likely(min_nr <= nr && min_nr >= 0))
                        ret = read_events(ioctx, min_nr, nr, events, timeout);
                put_ioctx(ioctx);
        }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 57ce55b2564..c5567cb7843 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
-static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
-                                int flags, const char *dev_name, void *data)
-{
-        return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
-}
 /*
 * anon_inodefs_dname() is called from d_path().
 */
@@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
                                dentry->d_name.name);
 }
+/* Dentry operations for anon_inodefs: only d_dname is provided. */
+static const struct dentry_operations anon_inodefs_dentry_operations = {
+        .d_dname        = anon_inodefs_dname,
+};
+/*
+ * Mount callback for anon_inodefs: delegates to mount_pseudo() with the
+ * "anon_inode:" name, the anon_inodefs dentry operations and
+ * ANON_INODE_FS_MAGIC.  flags, dev_name and data are not used here.
+ */
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+                                int flags, const char *dev_name, void *data)
+{
+        return mount_pseudo(fs_type, "anon_inode:", NULL,
+                        &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+}
 static struct file_system_type anon_inode_fs_type = {
        .name           = "anon_inodefs",
        .mount          = anon_inodefs_mount,
        .kill_sb        = kill_anon_super,
 };
-static const struct dentry_operations anon_inodefs_dentry_operations = {
-        .d_dname        = anon_inodefs_dname,
-};
 /*
 * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = {
 };
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
- *                    anonymous inode, and a dentry that describe the "class"
+ *                      anonymous inode, and a dentry that describe the "class"
- *                    of the file
+ *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
@@ -102,7 +104,7 @@ struct file *anon_inode_getfile(const char *name,
        this.name = name;
        this.len = strlen(name);
        this.hash = 0;
-        path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+        path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
        if (!path.dentry)
                goto err_module;
@@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name,
         */
        ihold(anon_inode_inode);
-        path.dentry->d_op = &anon_inodefs_dentry_operations;
        d_instantiate(path.dentry, anon_inode_inode);
        error = -ENFILE;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d..54f92379272 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do {							\
                current->pid, __func__, ##args);        \
 } while (0)
+extern spinlock_t autofs4_lock;
 /* Unified info structure.  This is pointed to by both the dentry and
   inode structures.  Each file in the filesystem has an instance of this
   structure.  It holds a reference to the dentry, so dentries are never
@@ -85,18 +88,9 @@ struct autofs_info {
        uid_t uid;
        gid_t gid;
-        mode_t  mode;
-        size_t  size;
-        void (*free)(struct autofs_info *);
-        union {
-                const char *symlink;
-        } u;
 };
 #define AUTOFS_INF_EXPIRING     (1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT   (1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING      (1<<2) /* dentry pending mount */
 struct autofs_wait_queue {
@@ -173,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
        return 0;
 }
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
-{
-        dst->f_path.dentry->d_inode->i_atime =
-                src->f_path.dentry->d_inode->i_atime;
-        return;
-}
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
 void autofs4_free_ino(struct autofs_info *);
 /* Expiration */
@@ -209,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
 extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+/* VFS automount flags management functions */
+/* Set DCACHE_NEED_AUTOMOUNT in d_flags; takes no lock itself. */
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+        dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+/* Same as __managed_dentry_set_automount(), done under dentry->d_lock. */
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_set_automount(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/* Clear DCACHE_NEED_AUTOMOUNT in d_flags; takes no lock itself. */
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+        dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+/* Same as __managed_dentry_clear_automount(), done under dentry->d_lock. */
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_clear_automount(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/* Set DCACHE_MANAGE_TRANSIT in d_flags; takes no lock itself. */
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+        dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+/* Same as __managed_dentry_set_transit(), done under dentry->d_lock. */
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_set_transit(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/* Clear DCACHE_MANAGE_TRANSIT in d_flags; takes no lock itself. */
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+        dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+/* Same as __managed_dentry_clear_transit(), done under dentry->d_lock. */
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_clear_transit(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/*
+ * Set both DCACHE_NEED_AUTOMOUNT and DCACHE_MANAGE_TRANSIT in d_flags in a
+ * single update; takes no lock itself.
+ */
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+        dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+/* Same as __managed_dentry_set_managed(), done under dentry->d_lock. */
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_set_managed(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/*
+ * Clear both DCACHE_NEED_AUTOMOUNT and DCACHE_MANAGE_TRANSIT in d_flags in a
+ * single update; takes no lock itself.
+ */
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+        dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+/* Same as __managed_dentry_clear_managed(), done under dentry->d_lock. */
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        __managed_dentry_clear_managed(dentry);
+        spin_unlock(&dentry->d_lock);
+}
 /* Initializing function */
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
 /* Queue management functions */
@@ -226,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
-static inline int autofs4_follow_mount(struct path *path)
-{
-        int res = 0;
-        while (d_mountpoint(path->dentry)) {
-                int followed = follow_down(path);
-                if (!followed)
-                        break;
-                res = 1;
-        }
-        return res;
-}
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
        return new_encode_dev(sbi->sb->s_dev);
@@ -254,17 +301,15 @@ static inline int simple_positive(struct dentry *dentry)
        return dentry->d_inode && !d_unhashed(dentry);
 }
-static inline int __simple_empty(struct dentry *dentry)
+/*
+ * Link the dentry's autofs_info onto sbi->expiring_list.  Nothing is done
+ * when the dentry has no autofs_info or when ino->expiring is already linked
+ * somewhere (non-empty).  No lock is taken in this helper.
+ */
+static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
-        struct dentry *child;
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-        int ret = 0;
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        if (ino) {
-        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
+                if (list_empty(&ino->expiring))
-                if (simple_positive(child))
+                        list_add(&ino->expiring, &sbi->expiring_list);
-                        goto out;
+        }
-        ret = 1;
+        return;
-out:
-        return ret;
 }
 static inline void autofs4_add_expiring(struct dentry *dentry)
@@ -293,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
        return;
 }
-void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469..1442da4860e 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
                err = have_submounts(path.dentry);
-                if (follow_down(&path))
+                if (follow_down_one(&path))
                        magic = path.mnt->mnt_sb->s_magic;
        }
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb..f43100b9662 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
        if (ino == NULL)
                return 0;
-        /* No point expiring a pending mount */
-        if (ino->flags & AUTOFS_INF_PENDING)
-                return 0;
        if (!do_now) {
                /* Too young to die */
                if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
        path_get(&path);
-        if (!follow_down(&path))
+        if (!follow_down_one(&path))
                goto done;
        if (is_autofs4_dentry(path.dentry)) {
@@ -91,24 +87,64 @@ done:
 }
 /*
- * Calculate next entry in top down tree traversal.
+ * Calculate and dget next entry in top down tree traversal.
+ *
+ * Call with prev == NULL to start a walk: root is returned with a reference
+ * taken.  On later calls the reference held on prev is dropped and the next
+ * dentry for which simple_positive() holds is returned with a reference
+ * taken, or NULL once the climb back up reaches root.  autofs4_lock and the
+ * d_lock of the dentries being examined are held while stepping.
- * From next_mnt in namespace.c - elegant.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+                                                struct dentry *root)
 {
-        struct list_head *next = p->d_subdirs.next;
+        struct list_head *next;
+        struct dentry *p, *ret;
+        if (prev == NULL)
+                return dget(root);
+        spin_lock(&autofs4_lock);
+relock:
+        p = prev;
+        spin_lock(&p->d_lock);
+again:
+        next = p->d_subdirs.next;
         if (next == &p->d_subdirs) {
                 while (1) {
-                        if (p == root)
+                        struct dentry *parent;
+                        if (p == root) {
+                                spin_unlock(&p->d_lock);
+                                spin_unlock(&autofs4_lock);
+                                dput(prev);
                                 return NULL;
+                        }
+                        parent = p->d_parent;
+                        /*
+                         * Only try-lock the parent while p's d_lock is held;
+                         * on contention drop p's lock and restart from prev.
+                         */
+                        if (!spin_trylock(&parent->d_lock)) {
+                                spin_unlock(&p->d_lock);
+                                cpu_relax();
+                                goto relock;
+                        }
+                        spin_unlock(&p->d_lock);
                         next = p->d_u.d_child.next;
-                        if (next != &p->d_parent->d_subdirs)
+                        p = parent;
+                        if (next != &parent->d_subdirs)
                                 break;
-                        p = p->d_parent;
                 }
         }
-        return list_entry(next, struct dentry, d_u.d_child);
+        ret = list_entry(next, struct dentry, d_u.d_child);
+        spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+        /* Negative dentry - try next */
+        if (!simple_positive(ret)) {
+                spin_unlock(&p->d_lock);
+                p = ret;
+                goto again;
+        }
+        /* ret->d_lock is already held, hence the _dlock variant of dget */
+        dget_dlock(ret);
+        spin_unlock(&ret->d_lock);
+        spin_unlock(&p->d_lock);
+        spin_unlock(&autofs4_lock);
+        dput(prev);
+        return ret;
 }
 /*
@@ -158,18 +194,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
        if (!simple_positive(top))
                return 1;
-        spin_lock(&dcache_lock);
+        p = NULL;
-        for (p = top; p; p = next_dentry(p, top)) {
+        while ((p = get_next_positive_dentry(p, top))) {
-                /* Negative dentry - give up */
-                if (!simple_positive(p))
-                        continue;
                DPRINTK("dentry %p %.*s",
                        p, (int) p->d_name.len, p->d_name.name);
-                p = dget(p);
-                spin_unlock(&dcache_lock);
                /*
                 * Is someone visiting anywhere in the subtree ?
                 * If there's no mount we need to check the usage
@@ -198,16 +227,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                        else
                                ino_count++;
-                        if (atomic_read(&p->d_count) > ino_count) {
+                        if (p->d_count > ino_count) {
                                top_ino->last_used = jiffies;
                                dput(p);
                                return 1;
                        }
                }
-                dput(p);
-                spin_lock(&dcache_lock);
        }
-        spin_unlock(&dcache_lock);
        /* Timeout of a tree mount is ultimately determined by its top dentry */
        if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +252,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
        DPRINTK("parent %p %.*s",
                parent, (int)parent->d_name.len, parent->d_name.name);
-        spin_lock(&dcache_lock);
+        p = NULL;
-        for (p = parent; p; p = next_dentry(p, parent)) {
+        while ((p = get_next_positive_dentry(p, parent))) {
-                /* Negative dentry - give up */
-                if (!simple_positive(p))
-                        continue;
                DPRINTK("dentry %p %.*s",
                        p, (int) p->d_name.len, p->d_name.name);
-                p = dget(p);
-                spin_unlock(&dcache_lock);
                if (d_mountpoint(p)) {
                        /* Can we umount this guy */
                        if (autofs4_mount_busy(mnt, p))
-                                goto cont;
+                                continue;
                        /* Can we expire this guy */
                        if (autofs4_can_expire(p, timeout, do_now))
                                return p;
                }
-cont:
-                dput(p);
-                spin_lock(&dcache_lock);
        }
-        spin_unlock(&dcache_lock);
        return NULL;
 }
@@ -264,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
        unsigned long timeout;
        struct dentry *root = dget(sb->s_root);
        int do_now = how & AUTOFS_EXP_IMMEDIATE;
+        struct autofs_info *ino;
        if (!root)
                return NULL;
@@ -272,17 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
        timeout = sbi->exp_timeout;
        spin_lock(&sbi->fs_lock);
+        ino = autofs4_dentry_ino(root);
+        /* No point expiring a pending mount */
+        if (ino->flags & AUTOFS_INF_PENDING) {
+                spin_unlock(&sbi->fs_lock);
+                return NULL;
+        }
+        managed_dentry_set_transit(root);
        if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
                struct autofs_info *ino = autofs4_dentry_ino(root);
-                if (d_mountpoint(root)) {
-                        ino->flags |= AUTOFS_INF_MOUNTPOINT;
-                        root->d_mounted--;
-                }
                ino->flags |= AUTOFS_INF_EXPIRING;
                init_completion(&ino->expire_complete);
                spin_unlock(&sbi->fs_lock);
                return root;
        }
+        managed_dentry_clear_transit(root);
        spin_unlock(&sbi->fs_lock);
        dput(root);
@@ -302,8 +322,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
        unsigned long timeout;
        struct dentry *root = sb->s_root;
+        struct dentry *dentry;
        struct dentry *expired = NULL;
-        struct list_head *next;
        int do_now = how & AUTOFS_EXP_IMMEDIATE;
        int exp_leaves = how & AUTOFS_EXP_LEAVES;
        struct autofs_info *ino;
@@ -315,25 +335,14 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
        now = jiffies;
        timeout = sbi->exp_timeout;
-        spin_lock(&dcache_lock);
+        dentry = NULL;
-        next = root->d_subdirs.next;
+        while ((dentry = get_next_positive_dentry(dentry, root))) {
-        /* On exit from the loop expire is set to a dgot dentry
-         * to expire or it's NULL */
-        while ( next != &root->d_subdirs ) {
-                struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-                /* Negative dentry - give up */
-                if (!simple_positive(dentry)) {
-                        next = next->next;
-                        continue;
-                }
-                dentry = dget(dentry);
-                spin_unlock(&dcache_lock);
                spin_lock(&sbi->fs_lock);
                ino = autofs4_dentry_ino(dentry);
+                /* No point expiring a pending mount */
+                if (ino->flags & AUTOFS_INF_PENDING)
+                        goto cont;
+                managed_dentry_set_transit(dentry);
                /*
                 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -347,7 +356,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 2;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        /* Can we umount this guy */
@@ -369,7 +378,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                if (!exp_leaves) {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +392,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                } else {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                        if (atomic_read(&dentry->d_count) > ino_count)
+                        if (dentry->d_count > ino_count)
                                goto next;
                        expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -393,12 +402,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                        }
                }
 next:
+                managed_dentry_clear_transit(dentry);
+cont:
                spin_unlock(&sbi->fs_lock);
-                dput(dentry);
-                spin_lock(&dcache_lock);
-                next = next->next;
        }
-        spin_unlock(&dcache_lock);
        return NULL;
 found:
@@ -408,9 +415,13 @@ found:
        ino->flags |= AUTOFS_INF_EXPIRING;
        init_completion(&ino->expire_complete);
        spin_unlock(&sbi->fs_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&expired->d_parent->d_lock);
+        spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
        list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&expired->d_lock);
+        spin_unlock(&expired->d_parent->d_lock);
+        spin_unlock(&autofs4_lock);
        return expired;
 }
@@ -473,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
        spin_lock(&sbi->fs_lock);
        ino = autofs4_dentry_ino(dentry);
        ino->flags &= ~AUTOFS_INF_EXPIRING;
+        if (!d_unhashed(dentry))
+                managed_dentry_clear_transit(dentry);
        complete_all(&ino->expire_complete);
        spin_unlock(&sbi->fs_lock);
@@ -498,11 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
                ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
                spin_lock(&sbi->fs_lock);
-                if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-                        sb->s_root->d_mounted++;
-                        ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-                }
                ino->flags &= ~AUTOFS_INF_EXPIRING;
+                spin_lock(&dentry->d_lock);
+                if (ret)
+                        __managed_dentry_clear_transit(dentry);
+                else {
+                        if ((IS_ROOT(dentry) ||
+                            (autofs_type_indirect(sbi->type) &&
+                             IS_ROOT(dentry->d_parent))) &&
+                            !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                                __managed_dentry_set_automount(dentry);
+                }
+                spin_unlock(&dentry->d_lock);
                complete_all(&ino->expire_complete);
                spin_unlock(&sbi->fs_lock);
                dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac87e49fa70..180fa2425e4 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
 #include "autofs_i.h"
 #include <linux/module.h>
-static void ino_lnkfree(struct autofs_info *ino)
+/*
+ * Allocate a zeroed autofs_info (kzalloc, GFP_KERNEL).  On success the
+ * active and expiring list heads are initialised, last_used is set to the
+ * current jiffies and sbi is recorded.  Returns the new structure, or NULL
+ * if the allocation failed.
+ */
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-        if (ino->u.symlink) {
+        struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
-                kfree(ino->u.symlink);
+        if (ino) {
-                ino->u.symlink = NULL;
-        }
-}
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-                                     struct autofs_sb_info *sbi, mode_t mode)
-{
-        int reinit = 1;
-        if (ino == NULL) {
-                reinit = 0;
-                ino = kmalloc(sizeof(*ino), GFP_KERNEL);
-        }
-        if (ino == NULL)
-                return NULL;
-        if (!reinit) {
-                ino->flags = 0;
-                ino->inode = NULL;
-                ino->dentry = NULL;
-                ino->size = 0;
                 INIT_LIST_HEAD(&ino->active);
-                ino->active_count = 0;
                 INIT_LIST_HEAD(&ino->expiring);
-                atomic_set(&ino->count, 0);
+                ino->last_used = jiffies;
+                ino->sbi = sbi;
         }
+        return ino;
+}
+/*
+ * Reset an existing autofs_info for reuse: uid and gid go back to 0 and
+ * last_used is refreshed to the current jiffies.  Other fields are left
+ * untouched.
+ */
+void autofs4_clean_ino(struct autofs_info *ino)
+{
         ino->uid = 0;
         ino->gid = 0;
-        ino->mode = mode;
         ino->last_used = jiffies;
-        ino->sbi = sbi;
-        if (reinit && ino->free)
-                (ino->free)(ino);
-        memset(&ino->u, 0, sizeof(ino->u));
-        ino->free = NULL;
-        if (S_ISLNK(mode))
-                ino->free = ino_lnkfree;
-        return ino;
 }
 void autofs4_free_ino(struct autofs_info *ino)
 {
-        struct autofs_info *p_ino;
-        if (ino->dentry) {
-                ino->dentry->d_fsdata = NULL;
-                if (ino->dentry->d_inode) {
-                        struct dentry *parent = ino->dentry->d_parent;
-                        if (atomic_dec_and_test(&ino->count)) {
-                                p_ino = autofs4_dentry_ino(parent);
-                                if (p_ino && parent != ino->dentry)
-                                        atomic_dec(&p_ino->count);
-                        }
-                        dput(ino->dentry);
-                }
-                ino->dentry = NULL;
-        }
-        if (ino->free)
-                (ino->free)(ino);
+        /* No dentry or refcount teardown here: only the structure is freed. */
         kfree(ino);
 }
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
        return 0;
 }
+static void autofs4_evict_inode(struct inode *inode)
+{
+        end_writeback(inode);
+        kfree(inode->i_private);
+}
 static const struct super_operations autofs4_sops = {
        .statfs         = simple_statfs,
        .show_options   = autofs4_show_options,
+        .evict_inode    = autofs4_evict_inode,
 };
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
        return (*pipefd < 0);
 }
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
-        struct autofs_info *ino;
-        ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
-        if (!ino)
-                return NULL;
-        return ino;
-}
-static const struct dentry_operations autofs4_sb_dentry_operations = {
-        .d_release      = autofs4_dentry_release,
-};
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
        struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
        s->s_blocksize_bits = 10;
        s->s_magic = AUTOFS_SUPER_MAGIC;
        s->s_op = &autofs4_sops;
+        s->s_d_op = &autofs4_dentry_operations;
        s->s_time_gran = 1;
        /*
         * Get the root inode and dentry, but defer checking for errors.
         */
-        ino = autofs4_mkroot(sbi);
+        ino = autofs4_new_ino(sbi);
        if (!ino)
                goto fail_free;
-        root_inode = autofs4_get_inode(s, ino);
+        root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
        if (!root_inode)
                goto fail_ino;
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
                goto fail_iput;
        pipe = NULL;
-        root->d_op = &autofs4_sb_dentry_operations;
        root->d_fsdata = ino;
        /* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
                goto fail_dput;
        }
+        if (autofs_type_trigger(sbi->type))
+                __managed_dentry_set_managed(root);
        root_inode->i_fop = &autofs4_root_operations;
-        root_inode->i_op = autofs_type_trigger(sbi->type) ?
+        root_inode->i_op = &autofs4_dir_inode_operations;
-                        &autofs4_direct_root_inode_operations :
-                        &autofs4_indirect_root_inode_operations;
        /* Couldn't this be tested earlier? */
        if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
        return -EINVAL;
 }
-struct inode *autofs4_get_inode(struct super_block *sb,
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
-                                struct autofs_info *inf)
 {
        struct inode *inode = new_inode(sb);
        if (inode == NULL)
                return NULL;
-        inf->inode = inode;
+        inode->i_mode = mode;
-        inode->i_mode = inf->mode;
        if (sb->s_root) {
                inode->i_uid = sb->s_root->d_inode->i_uid;
                inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        inode->i_ino = get_next_ino();
-        if (S_ISDIR(inf->mode)) {
+        if (S_ISDIR(mode)) {
                inode->i_nlink = 2;
                inode->i_op = &autofs4_dir_inode_operations;
                inode->i_fop = &autofs4_dir_operations;
-        } else if (S_ISLNK(inf->mode)) {
+        } else if (S_ISLNK(mode)) {
-                inode->i_size = inf->size;
                inode->i_op = &autofs4_symlink_inode_operations;
        }
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f003..014e7aba3b0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,6 +23,8 @@
 #include "autofs_i.h"
+DEFINE_SPINLOCK(autofs4_lock);
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -33,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
-static void *autofs4_follow_link(struct dentry *, struct nameidata *);
+static struct vfsmount *autofs4_d_automount(struct path *);
+static int autofs4_d_manage(struct dentry *, bool, bool);
-#define TRIGGER_FLAGS   (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
+static void autofs4_dentry_release(struct dentry *);
-#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
 const struct file_operations autofs4_root_operations = {
        .open           = dcache_dir_open,
@@ -58,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
        .llseek         = dcache_dir_lseek,
 };
-const struct inode_operations autofs4_indirect_root_inode_operations = {
+const struct inode_operations autofs4_dir_inode_operations = {
        .lookup         = autofs4_lookup,
        .unlink         = autofs4_dir_unlink,
        .symlink        = autofs4_dir_symlink,
@@ -66,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
        .rmdir          = autofs4_dir_rmdir,
 };
-const struct inode_operations autofs4_direct_root_inode_operations = {
+const struct dentry_operations autofs4_dentry_operations = {
-        .lookup         = autofs4_lookup,
+        .d_automount    = autofs4_d_automount,
-        .unlink         = autofs4_dir_unlink,
+        .d_manage       = autofs4_d_manage,
-        .mkdir          = autofs4_dir_mkdir,
+        .d_release      = autofs4_dentry_release,
-        .rmdir          = autofs4_dir_rmdir,
-        .follow_link    = autofs4_follow_link,
-};
-const struct inode_operations autofs4_dir_inode_operations = {
-        .lookup         = autofs4_lookup,
-        .unlink         = autofs4_dir_unlink,
-        .symlink        = autofs4_dir_symlink,
-        .mkdir          = autofs4_dir_mkdir,
-        .rmdir          = autofs4_dir_rmdir,
 };
 static void autofs4_add_active(struct dentry *dentry)
@@ -114,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
        return;
 }
-static unsigned int autofs4_need_mount(unsigned int flags)
-{
-        unsigned int res = 0;
-        if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
-                res = 1;
-        return res;
-}
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
        struct dentry *dentry = file->f_path.dentry;
@@ -142,275 +125,41 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
         * autofs file system so just let the libfs routines handle
         * it.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&dentry->d_lock);
        if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&autofs4_lock);
                return -ENOENT;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&autofs4_lock);
 out:
        return dcache_dir_open(inode, file);
 }
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
+static void autofs4_dentry_release(struct dentry *de)
-{
-        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-        struct autofs_info *ino = autofs4_dentry_ino(dentry);
-        int status;
-        DPRINTK("dentry=%p %.*s ino=%p",
-                 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-        /*
-         * Wait for a pending mount, triggering one if there
-         * isn't one already
-         */
-        if (dentry->d_inode == NULL) {
-                DPRINTK("waiting for mount name=%.*s",
-                         dentry->d_name.len, dentry->d_name.name);
-                status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-                DPRINTK("mount done status=%d", status);
-                /* Turn this into a real negative dentry? */
-                if (status == -ENOENT) {
-                        spin_lock(&sbi->fs_lock);
-                        ino->flags &= ~AUTOFS_INF_PENDING;
-                        spin_unlock(&sbi->fs_lock);
-                        return status;
-                } else if (status) {
-                        /* Return a negative dentry, but leave it "pending" */
-                        return status;
-                }
-        /* Trigger mount for path component or follow link */
-        } else if (ino->flags & AUTOFS_INF_PENDING ||
-                        autofs4_need_mount(flags)) {
-                DPRINTK("waiting for mount name=%.*s",
-                        dentry->d_name.len, dentry->d_name.name);
-                spin_lock(&sbi->fs_lock);
-                ino->flags |= AUTOFS_INF_PENDING;
-                spin_unlock(&sbi->fs_lock);
-                status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-                DPRINTK("mount done status=%d", status);
-                if (status) {
-                        spin_lock(&sbi->fs_lock);
-                        ino->flags &= ~AUTOFS_INF_PENDING;
-                        spin_unlock(&sbi->fs_lock);
-                        return status;
-                }
-        }
-        /* Initialize expiry counter after successful mount */
-        ino->last_used = jiffies;
-        spin_lock(&sbi->fs_lock);
-        ino->flags &= ~AUTOFS_INF_PENDING;
-        spin_unlock(&sbi->fs_lock);
-        return 0;
-}
-/* For autofs direct mounts the follow link triggers the mount */
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-        struct autofs_info *ino = autofs4_dentry_ino(dentry);
-        int oz_mode = autofs4_oz_mode(sbi);
-        unsigned int lookup_type;
-        int status;
-        DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
-                dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
-                nd->flags);
-        /*
-         * For an expire of a covered direct or offset mount we need
-         * to break out of follow_down() at the autofs mount trigger
-         * (d_mounted--), so we can see the expiring flag, and manage
-         * the blocking and following here until the expire is completed.
-         */
-        if (oz_mode) {
-                spin_lock(&sbi->fs_lock);
-                if (ino->flags & AUTOFS_INF_EXPIRING) {
-                        spin_unlock(&sbi->fs_lock);
-                        /* Follow down to our covering mount. */
-                        if (!follow_down(&nd->path))
-                                goto done;
-                        goto follow;
-                }
-                spin_unlock(&sbi->fs_lock);
-                goto done;
-        }
-        /* If an expire request is pending everyone must wait. */
-        autofs4_expire_wait(dentry);
-        /* We trigger a mount for almost all flags */
-        lookup_type = autofs4_need_mount(nd->flags);
-        spin_lock(&sbi->fs_lock);
-        spin_lock(&dcache_lock);
-        if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
-                spin_unlock(&dcache_lock);
-                spin_unlock(&sbi->fs_lock);
-                goto follow;
-        }
-        /*
-         * If the dentry contains directories then it is an autofs
-         * multi-mount with no root mount offset. So don't try to
-         * mount it again.
-         */
-        if (ino->flags & AUTOFS_INF_PENDING ||
-            (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-                spin_unlock(&dcache_lock);
-                spin_unlock(&sbi->fs_lock);
-                status = try_to_fill_dentry(dentry, nd->flags);
-                if (status)
-                        goto out_error;
-                goto follow;
-        }
-        spin_unlock(&dcache_lock);
-        spin_unlock(&sbi->fs_lock);
-follow:
-        /*
-         * If there is no root mount it must be an autofs
-         * multi-mount with no root offset so we don't need
-         * to follow it.
-         */
-        if (d_mountpoint(dentry)) {
-                if (!autofs4_follow_mount(&nd->path)) {
-                        status = -ENOENT;
-                        goto out_error;
-                }
-        }
-done:
-        return NULL;
-out_error:
-        path_put(&nd->path);
-        return ERR_PTR(status);
-}
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct autofs_info *ino = autofs4_dentry_ino(de);
-        struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
+        struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-        int oz_mode = autofs4_oz_mode(sbi);
-        int flags = nd ? nd->flags : 0;
-        int status = 1;
-        /* Pending dentry */
-        spin_lock(&sbi->fs_lock);
-        if (autofs4_ispending(dentry)) {
-                /* The daemon never causes a mount to trigger */
-                spin_unlock(&sbi->fs_lock);
-                if (oz_mode)
-                        return 1;
-                /*
-                 * If the directory has gone away due to an expire
-                 * we have been called as ->d_revalidate() and so
-                 * we need to return false and proceed to ->lookup().
-                 */
-                if (autofs4_expire_wait(dentry) == -EAGAIN)
-                        return 0;
-                /*
-                 * A zero status is success otherwise we have a
-                 * negative error code.
-                 */
-                status = try_to_fill_dentry(dentry, flags);
-                if (status == 0)
-                        return 1;
-                return status;
-        }
-        spin_unlock(&sbi->fs_lock);
-        /* Negative dentry.. invalidate if "old" */
-        if (dentry->d_inode == NULL)
-                return 0;
-        /* Check for a non-mountpoint directory with no contents */
-        spin_lock(&dcache_lock);
-        if (S_ISDIR(dentry->d_inode->i_mode) &&
-            !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-                DPRINTK("dentry=%p %.*s, emptydir",
-                         dentry, dentry->d_name.len, dentry->d_name.name);
-                spin_unlock(&dcache_lock);
-                /* The daemon never causes a mount to trigger */
-                if (oz_mode)
-                        return 1;
-                /*
-                 * A zero status is success otherwise we have a
-                 * negative error code.
-                 */
-                status = try_to_fill_dentry(dentry, flags);
-                if (status == 0)
-                        return 1;
-                return status;
-        }
-        spin_unlock(&dcache_lock);
-        return 1;
-}
-void autofs4_dentry_release(struct dentry *de)
-{
-        struct autofs_info *inf;
        DPRINTK("releasing %p", de);
-        inf = autofs4_dentry_ino(de);
+        if (!ino)
-        de->d_fsdata = NULL;
+                return;
-        if (inf) {
-                struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-                if (sbi) {
-                        spin_lock(&sbi->lookup_lock);
-                        if (!list_empty(&inf->active))
-                                list_del(&inf->active);
-                        if (!list_empty(&inf->expiring))
-                                list_del(&inf->expiring);
-                        spin_unlock(&sbi->lookup_lock);
-                }
-                inf->dentry = NULL;
-                inf->inode = NULL;
-                autofs4_free_ino(inf);
+        if (sbi) {
+                spin_lock(&sbi->lookup_lock);
+                if (!list_empty(&ino->active))
+                        list_del(&ino->active);
+                if (!list_empty(&ino->expiring))
+                        list_del(&ino->expiring);
+                spin_unlock(&sbi->lookup_lock);
        }
-}
-/* For dentries of directories in the root dir */
+        autofs4_free_ino(ino);
-static const struct dentry_operations autofs4_root_dentry_operations = {
+}
-        .d_revalidate   = autofs4_revalidate,
-        .d_release      = autofs4_dentry_release,
-};
-/* For other dentries */
-static const struct dentry_operations autofs4_dentry_operations = {
-        .d_revalidate   = autofs4_revalidate,
-        .d_release      = autofs4_dentry_release,
-};
 static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
@@ -422,7 +171,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
        const unsigned char *str = name->name;
        struct list_head *p, *head;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        spin_lock(&sbi->lookup_lock);
        head = &sbi->active_list;
        list_for_each(p, head) {
@@ -436,7 +185,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
                spin_lock(&active->d_lock);
                /* Already gone? */
-                if (atomic_read(&active->d_count) == 0)
+                if (active->d_count == 0)
                        goto next;
                qstr = &active->d_name;
@@ -452,17 +201,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
                        goto next;
                if (d_unhashed(active)) {
-                        dget(active);
+                        dget_dlock(active);
                        spin_unlock(&active->d_lock);
                        spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&autofs4_lock);
                        return active;
                }
 next:
                spin_unlock(&active->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        return NULL;
 }
@@ -477,7 +226,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
        const unsigned char *str = name->name;
        struct list_head *p, *head;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        spin_lock(&sbi->lookup_lock);
        head = &sbi->expiring_list;
        list_for_each(p, head) {
@@ -507,66 +256,261 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
                        goto next;
                if (d_unhashed(expiring)) {
-                        dget(expiring);
+                        dget_dlock(expiring);
                        spin_unlock(&expiring->d_lock);
                        spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&autofs4_lock);
                        return expiring;
                }
 next:
                spin_unlock(&expiring->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
+        return NULL;
+}
+static int autofs4_mount_wait(struct dentry *dentry)
+{
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        int status;
+        if (ino->flags & AUTOFS_INF_PENDING) {
+                DPRINTK("waiting for mount name=%.*s",
+                        dentry->d_name.len, dentry->d_name.name);
+                status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+                DPRINTK("mount wait done status=%d", status);
+                ino->last_used = jiffies;
+                return status;
+        }
+        return 0;
+}
+static int do_expire_wait(struct dentry *dentry)
+{
+        struct dentry *expiring;
+        expiring = autofs4_lookup_expiring(dentry);
+        if (!expiring)
+                return autofs4_expire_wait(dentry);
+        else {
+                /*
+                 * If we are racing with expire the request might not
+                 * be quite complete, but the directory has been removed
+                 * so it must have been successful, just wait for it.
+                 */
+                autofs4_expire_wait(expiring);
+                autofs4_del_expiring(expiring);
+                dput(expiring);
+        }
+        return 0;
+}
+static struct dentry *autofs4_mountpoint_changed(struct path *path)
+{
+        struct dentry *dentry = path->dentry;
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        /*
+         * If this is an indirect mount the dentry could have gone away
+         * as a result of an expire and a new one created.
+         */
+        if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
+                struct dentry *parent = dentry->d_parent;
+                struct dentry *new = d_lookup(parent, &dentry->d_name);
+                if (!new)
+                        return NULL;
+                dput(path->dentry);
+                path->dentry = new;
+        }
+        return path->dentry;
+}
+static struct vfsmount *autofs4_d_automount(struct path *path)
+{
+        struct dentry *dentry = path->dentry;
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        int status;
+        DPRINTK("dentry=%p %.*s",
+                dentry, dentry->d_name.len, dentry->d_name.name);
+        /*
+         * Someone may have manually umounted this or it was a submount
+         * that has gone away.
+         */
+        spin_lock(&dentry->d_lock);
+        if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+                if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+                     (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                        __managed_dentry_set_transit(path->dentry);
+        }
+        spin_unlock(&dentry->d_lock);
+        /* The daemon never triggers a mount. */
+        if (autofs4_oz_mode(sbi))
+                return NULL;
+        /*
+         * If an expire request is pending everyone must wait.
+         * If the expire fails we're still mounted so continue
+         * the follow and return. A return of -EAGAIN (which only
+         * happens with indirect mounts) means the expire completed
+         * and the directory was removed, so just go ahead and try
+         * the mount.
+         */
+        status = do_expire_wait(dentry);
+        if (status && status != -EAGAIN)
+                return NULL;
+        /* Callback to the daemon to perform the mount or wait */
+        spin_lock(&sbi->fs_lock);
+        if (ino->flags & AUTOFS_INF_PENDING) {
+                spin_unlock(&sbi->fs_lock);
+                status = autofs4_mount_wait(dentry);
+                if (status)
+                        return ERR_PTR(status);
+                spin_lock(&sbi->fs_lock);
+                goto done;
+        }
+        /*
+         * If the dentry is a symlink it's equivalent to a directory
+         * having d_mountpoint() true, so there's no need to call back
+         * to the daemon.
+         */
+        if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+                goto done;
+        if (!d_mountpoint(dentry)) {
+                /*
+                 * It's possible that user space hasn't removed directories
+                 * after umounting a rootless multi-mount, although it
+                 * should. For v5 have_submounts() is sufficient to handle
+                 * this because the leaves of the directory tree under the
+                 * mount never trigger mounts themselves (they have an autofs
+                 * trigger mount mounted on them). But v4 pseudo direct mounts
+                 * do need the leaves to trigger mounts. In this case we
+                 * have no choice but to use the list_empty() check and
+                 * require user space behave.
+                 */
+                if (sbi->version > 4) {
+                        if (have_submounts(dentry))
+                                goto done;
+                } else {
+                        spin_lock(&dentry->d_lock);
+                        if (!list_empty(&dentry->d_subdirs)) {
+                                spin_unlock(&dentry->d_lock);
+                                goto done;
+                        }
+                        spin_unlock(&dentry->d_lock);
+                }
+                ino->flags |= AUTOFS_INF_PENDING;
+                spin_unlock(&sbi->fs_lock);
+                status = autofs4_mount_wait(dentry);
+                if (status)
+                        return ERR_PTR(status);
+                spin_lock(&sbi->fs_lock);
+                ino->flags &= ~AUTOFS_INF_PENDING;
+        }
+done:
+        if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+                /*
+                 * Any needed mounting has been completed and the path updated
+                 * so turn this into a normal dentry so we don't continually
+                 * call ->d_automount() and ->d_manage().
+                 */
+                spin_lock(&dentry->d_lock);
+                __managed_dentry_clear_transit(dentry);
+                /*
+                 * Only clear DCACHE_NEED_AUTOMOUNT for rootless multi-mounts and
+                 * symlinks as in all other cases the dentry will be covered by
+                 * an actual mount so ->d_automount() won't be called during
+                 * the follow.
+                 */
+                if ((!d_mountpoint(dentry) &&
+                    !list_empty(&dentry->d_subdirs)) ||
+                    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+                        __managed_dentry_clear_automount(dentry);
+                spin_unlock(&dentry->d_lock);
+        }
+        spin_unlock(&sbi->fs_lock);
+        /* Mount succeeded, check if we ended up with a new dentry */
+        dentry = autofs4_mountpoint_changed(path);
+        if (!dentry)
+                return ERR_PTR(-ENOENT);
        return NULL;
 }
+int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
+{
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        DPRINTK("dentry=%p %.*s",
+                dentry, dentry->d_name.len, dentry->d_name.name);
+        /* The daemon never waits. */
+        if (autofs4_oz_mode(sbi) || mounting_here) {
+                if (!d_mountpoint(dentry))
+                        return -EISDIR;
+                return 0;
+        }
+        /* We need to sleep, so we need pathwalk to be in ref-mode */
+        if (rcu_walk)
+                return -ECHILD;
+        /* Wait for pending expires */
+        do_expire_wait(dentry);
+        /*
+         * This dentry may be under construction so wait on mount
+         * completion.
+         */
+        return autofs4_mount_wait(dentry);
+}
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
        struct autofs_sb_info *sbi;
        struct autofs_info *ino;
-        struct dentry *expiring, *active;
+        struct dentry *active;
-        int oz_mode;
-        DPRINTK("name = %.*s",
+        DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
-                dentry->d_name.len, dentry->d_name.name);
        /* File name too long to exist */
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        sbi = autofs4_sbi(dir->i_sb);
-        oz_mode = autofs4_oz_mode(sbi);
        DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
-                 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
+                current->pid, task_pgrp_nr(current), sbi->catatonic,
+                autofs4_oz_mode(sbi));
        active = autofs4_lookup_active(dentry);
        if (active) {
-                dentry = active;
+                return active;
-                ino = autofs4_dentry_ino(dentry);
        } else {
                /*
-                 * Mark the dentry incomplete but don't hash it. We do this
+                 * A dentry that is not within the root can never trigger a
-                 * to serialize our inode creation operations (symlink and
+                 * mount operation, unless the directory already exists, so we
-                 * mkdir) which prevents deadlock during the callback to
+                 * can return fail immediately.  The daemon however does need
-                 * the daemon. Subsequent user space lookups for the same
+                 * to create directories within the file system.
-                 * dentry are placed on the wait queue while the daemon
-                 * itself is allowed passage unresticted so the create
-                 * operation itself can then hash the dentry. Finally,
-                 * we check for the hashed dentry and return the newly
-                 * hashed dentry.
                 */
-                dentry->d_op = &autofs4_root_dentry_operations;
+                if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
+                        return ERR_PTR(-ENOENT);
-                /*
+                /* Mark entries in the root as mount triggers */
-                 * And we need to ensure that the same dentry is used for
+                if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
-                 * all following lookup calls until it is hashed so that
+                        __managed_dentry_set_managed(dentry);
-                 * the dentry flags are persistent throughout the request.
-                 */
+                ino = autofs4_new_ino(sbi);
-                ino = autofs4_init_ino(NULL, sbi, 0555);
                if (!ino)
                        return ERR_PTR(-ENOMEM);
@@ -577,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
                d_instantiate(dentry, NULL);
        }
-        if (!oz_mode) {
-                mutex_unlock(&dir->i_mutex);
-                expiring = autofs4_lookup_expiring(dentry);
-                if (expiring) {
-                        /*
-                         * If we are racing with expire the request might not
-                         * be quite complete but the directory has been removed
-                         * so it must have been successful, so just wait for it.
-                         */
-                        autofs4_expire_wait(expiring);
-                        autofs4_del_expiring(expiring);
-                        dput(expiring);
-                }
-                spin_lock(&sbi->fs_lock);
-                ino->flags |= AUTOFS_INF_PENDING;
-                spin_unlock(&sbi->fs_lock);
-                if (dentry->d_op && dentry->d_op->d_revalidate)
-                        (dentry->d_op->d_revalidate)(dentry, nd);
-                mutex_lock(&dir->i_mutex);
-        }
-        /*
-         * If we are still pending, check if we had to handle
-         * a signal. If so we can force a restart..
-         */
-        if (ino->flags & AUTOFS_INF_PENDING) {
-                /* See if we were interrupted */
-                if (signal_pending(current)) {
-                        sigset_t *sigset = &current->pending.signal;
-                        if (sigismember (sigset, SIGKILL) ||
-                            sigismember (sigset, SIGQUIT) ||
-                            sigismember (sigset, SIGINT)) {
-                            if (active)
-                                dput(active);
-                            return ERR_PTR(-ERESTARTNOINTR);
-                        }
-                }
-                if (!oz_mode) {
-                        spin_lock(&sbi->fs_lock);
-                        ino->flags &= ~AUTOFS_INF_PENDING;
-                        spin_unlock(&sbi->fs_lock);
-                }
-        }
-        /*
-         * If this dentry is unhashed, then we shouldn't honour this
-         * lookup.  Returning ENOENT here doesn't do the right thing
-         * for all system calls, but it should be OK for the operations
-         * we permit from an autofs.
-         */
-        if (!oz_mode && d_unhashed(dentry)) {
-                /*
-                 * A user space application can (and has done in the past)
-                 * remove and re-create this directory during the callback.
-                 * This can leave us with an unhashed dentry, but a
-                 * successful mount!  So we need to perform another
-                 * cached lookup in case the dentry now exists.
-                 */
-                struct dentry *parent = dentry->d_parent;
-                struct dentry *new = d_lookup(parent, &dentry->d_name);
-                if (new != NULL)
-                        dentry = new;
-                else
-                        dentry = ERR_PTR(-ENOENT);
-                if (active)
-                        dput(active);
-                return dentry;
-        }
-        if (active)
-                return active;
        return NULL;
 }
@@ -664,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
        struct autofs_info *ino = autofs4_dentry_ino(dentry);
        struct autofs_info *p_ino;
        struct inode *inode;
+        size_t size = strlen(symname);
        char *cp;
        DPRINTK("%s <- %.*s", symname,
@@ -672,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
        if (!autofs4_oz_mode(sbi))
                return -EACCES;
-        ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
+        BUG_ON(!ino);
-        if (!ino)
-                return -ENOMEM;
+        autofs4_clean_ino(ino);
        autofs4_del_active(dentry);
-        ino->size = strlen(symname);
+        cp = kmalloc(size + 1, GFP_KERNEL);
-        cp = kmalloc(ino->size + 1, GFP_KERNEL);
+        if (!cp)
-        if (!cp) {
-                if (!dentry->d_fsdata)
-                        kfree(ino);
                return -ENOMEM;
-        }
        strcpy(cp, symname);
-        inode = autofs4_get_inode(dir->i_sb, ino);
+        inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
        if (!inode) {
                kfree(cp);
                if (!dentry->d_fsdata)
                        kfree(ino);
                return -ENOMEM;
        }
+        inode->i_private = cp;
+        inode->i_size = size;
        d_add(dentry, inode);
-        if (dir == dir->i_sb->s_root->d_inode)
+        dget(dentry);
-                dentry->d_op = &autofs4_root_dentry_operations;
-        else
-                dentry->d_op = &autofs4_dentry_operations;
-        dentry->d_fsdata = ino;
-        ino->dentry = dget(dentry);
        atomic_inc(&ino->count);
        p_ino = autofs4_dentry_ino(dentry->d_parent);
        if (p_ino && dentry->d_parent != dentry)
                atomic_inc(&p_ino->count);
-        ino->inode = inode;
-        ino->u.symlink = cp;
        dir->i_mtime = CURRENT_TIME;
        return 0;
@@ -753,16 +612,68 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
        dir->i_mtime = CURRENT_TIME;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
        autofs4_add_expiring(dentry);
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
        return 0;
 }
+/*
+ * Version 4 of autofs provides a pseudo direct mount implementation
+ * that relies on directories at the leaves of a directory tree under
+ * an indirect mount to trigger mounts. To allow for this we need to
+ * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
+ * of the directory tree. There is no need to clear the automount flag
+ * following a mount or restore it after an expire because these mounts
+ * are always covered. However, it is necessary to ensure that these
+ * flags are clear on non-empty directories to avoid unnecessary calls
+ * during path walks.
+ */
+static void autofs_set_leaf_automount_flags(struct dentry *dentry)
+{
+        struct dentry *parent;
+        /* root and dentrys in the root are already handled */
+        if (IS_ROOT(dentry->d_parent))
+                return;
+        managed_dentry_set_managed(dentry);
+        parent = dentry->d_parent;
+        /* only consider parents below dentrys in the root */
+        if (IS_ROOT(parent->d_parent))
+                return;
+        managed_dentry_clear_managed(parent);
+        return;
+}
+static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
+{
+        struct list_head *d_child;
+        struct dentry *parent;
+        /* flags for dentrys in the root are handled elsewhere */
+        if (IS_ROOT(dentry->d_parent))
+                return;
+        managed_dentry_clear_managed(dentry);
+        parent = dentry->d_parent;
+        /* only consider parents below dentrys in the root */
+        if (IS_ROOT(parent->d_parent))
+                return;
+        d_child = &dentry->d_u.d_child;
+        /* Set parent managed if it's becoming empty */
+        if (d_child->next == &parent->d_subdirs &&
+            d_child->prev == &parent->d_subdirs)
+                managed_dentry_set_managed(parent);
+        return;
+}
 static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -775,16 +686,23 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
        if (!autofs4_oz_mode(sbi))
                return -EACCES;
-        spin_lock(&dcache_lock);
+        spin_lock(&autofs4_lock);
+        spin_lock(&sbi->lookup_lock);
+        spin_lock(&dentry->d_lock);
        if (!list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&sbi->lookup_lock);
+                spin_unlock(&autofs4_lock);
                return -ENOTEMPTY;
        }
-        autofs4_add_expiring(dentry);
+        __autofs4_add_expiring(dentry);
-        spin_lock(&dentry->d_lock);
+        spin_unlock(&sbi->lookup_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
+        if (sbi->version < 5)
+                autofs_clear_leaf_automount_flags(dentry);
        if (atomic_dec_and_test(&ino->count)) {
                p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -814,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        DPRINTK("dentry %p, creating %.*s",
                dentry, dentry->d_name.len, dentry->d_name.name);
-        ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
+        BUG_ON(!ino);
-        if (!ino)
-                return -ENOMEM;
+        autofs4_clean_ino(ino);
        autofs4_del_active(dentry);
-        inode = autofs4_get_inode(dir->i_sb, ino);
+        inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
-        if (!inode) {
+        if (!inode)
-                if (!dentry->d_fsdata)
-                        kfree(ino);
                return -ENOMEM;
-        }
        d_add(dentry, inode);
-        if (dir == dir->i_sb->s_root->d_inode)
+        if (sbi->version < 5)
-                dentry->d_op = &autofs4_root_dentry_operations;
+                autofs_set_leaf_automount_flags(dentry);
-        else
-                dentry->d_op = &autofs4_dentry_operations;
-        dentry->d_fsdata = ino;
+        dget(dentry);
-        ino->dentry = dget(dentry);
        atomic_inc(&ino->count);
        p_ino = autofs4_dentry_ino(dentry->d_parent);
        if (p_ino && dentry->d_parent != dentry)
                atomic_inc(&p_ino->count);
-        ino->inode = inode;
        inc_nlink(dir);
        dir->i_mtime = CURRENT_TIME;
@@ -921,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
 int is_autofs4_dentry(struct dentry *dentry)
 {
        return dentry && dentry->d_inode &&
-                (dentry->d_op == &autofs4_root_dentry_operations ||
+                dentry->d_op == &autofs4_dentry_operations &&
-                 dentry->d_op == &autofs4_dentry_operations) &&
                dentry->d_fsdata != NULL;
 }
@@ -980,19 +890,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
        }
 }
-static DEFINE_MUTEX(autofs4_ioctl_mutex);
 static long autofs4_root_ioctl(struct file *filp,
                               unsigned int cmd, unsigned long arg)
 {
-        long ret;
        struct inode *inode = filp->f_dentry->d_inode;
+        return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-        mutex_lock(&autofs4_ioctl_mutex);
-        ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-        mutex_unlock(&autofs4_ioctl_mutex);
-        return ret;
 }
 #ifdef CONFIG_COMPAT
@@ -1002,13 +904,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
        struct inode *inode = filp->f_path.dentry->d_inode;
        int ret;
-        mutex_lock(&autofs4_ioctl_mutex);
        if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
        else
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
                        (unsigned long)compat_ptr(arg));
-        mutex_unlock(&autofs4_ioctl_mutex);
        return ret;
 }
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2..f27c094a191 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
 static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        nd_set_link(nd, dentry->d_inode->i_private);
-        nd_set_link(nd, (char *)ino->u.symlink);
        return NULL;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f..56010056b2e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 {
        struct dentry *root = sbi->sb->s_root;
        struct dentry *tmp;
-        char *buf = *name;
+        char *buf;
        char *p;
-        int len = 0;
+        int len;
+        unsigned seq;
-        spin_lock(&dcache_lock);
+rename_retry:
+        buf = *name;
+        len = 0;
+        seq = read_seqbegin(&rename_lock);
+        rcu_read_lock();
+        spin_lock(&autofs4_lock);
        for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
                len += tmp->d_name.len + 1;
        if (!len || --len > NAME_MAX) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&autofs4_lock);
+                rcu_read_unlock();
+                if (read_seqretry(&rename_lock, seq))
+                        goto rename_retry;
                return 0;
        }
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
                p -= tmp->d_name.len;
                strncpy(p, tmp->d_name.name, tmp->d_name.len);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&autofs4_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
        return len;
 }
@@ -296,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
         * completed while we waited on the mutex ...
         */
        if (notify == NFY_MOUNT) {
+                struct dentry *new = NULL;
+                int valid = 1;
                /*
                 * If the dentry was successfully mounted while we slept
                 * on the wait queue mutex we can return success. If it
@@ -303,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
                 * a multi-mount with no mount at it's base) we can
                 * continue on and create a new request.
                 */
+                if (!IS_ROOT(dentry)) {
+                        if (dentry->d_inode && d_unhashed(dentry)) {
+                                struct dentry *parent = dentry->d_parent;
+                                new = d_lookup(parent, &dentry->d_name);
+                                if (new)
+                                        dentry = new;
+                        }
+                }
                if (have_submounts(dentry))
-                        return 0;
+                        valid = 0;
+                if (new)
+                        dput(new);
+                return valid;
        }
        return 1;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aadde..9ad2369d9e3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
        return -EIO;
 }
-static int bad_inode_permission(struct inode *inode, int mask)
+static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        return -EIO;
 }
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d0..27223878ba9 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 }
 static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
+fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
 {
        befs_data_stream data;
        int i;
        for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
-                data.direct[i] = fsrun_to_cpu(sb, n.direct[i]);
+                data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
-        data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range);
+        data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
-        data.indirect = fsrun_to_cpu(sb, n.indirect);
+        data.indirect = fsrun_to_cpu(sb, n->indirect);
-        data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range);
+        data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
-        data.double_indirect = fsrun_to_cpu(sb, n.double_indirect);
+        data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
        data.max_double_indirect_range = fs64_to_cpu(sb,
-                                                     n.
+                                                     n->
                                                     max_double_indirect_range);
-        data.size = fs64_to_cpu(sb, n.size);
+        data.size = fs64_to_cpu(sb, n->size);
        return data;
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index aa4e7c7ae3c..b1d0c794747 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
        return &bi->vfs_inode;
 }
-static void
+static void befs_i_callback(struct rcu_head *head)
-befs_destroy_inode(struct inode *inode)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
+static void befs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, befs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
                int num_blks;
                befs_ino->i_data.ds =
-                    fsds_to_cpu(sb, raw_inode->data.datastream);
+                    fsds_to_cpu(sb, &raw_inode->data.datastream);
                num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
                inode->i_blocks =
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 76db6d7d49b..a8e37f81d09 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
        return &bi->vfs_inode;
 }
-static void bfs_destroy_inode(struct inode *inode)
+static void bfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
+static void bfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, bfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct bfs_inode_info *bi = foo;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c..d5b640ba6cb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 static struct linux_binfmt elf_format = {
-                .module         = THIS_MODULE,
+        .module         = THIS_MODULE,
-                .load_binary    = load_elf_binary,
+        .load_binary    = load_elf_binary,
-                .load_shlib     = load_elf_library,
+        .load_shlib     = load_elf_library,
-                .core_dump      = elf_core_dump,
+        .core_dump      = elf_core_dump,
-                .min_coredump   = ELF_EXEC_PAGESIZE,
+        .min_coredump   = ELF_EXEC_PAGESIZE,
-                .hasvdso        = 1
 };
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        return 0;
 }
-#ifndef elf_map
 static unsigned long elf_map(struct file *filep, unsigned long addr,
                struct elf_phdr *eppnt, int prot, int type,
                unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        return(map_addr);
 }
-#endif /* !elf_map */
 static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
 {
        int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                goto out;
        retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
-                             (char *)elf_phdata,size);
+                             (char *)elf_phdata, size);
        error = -EIO;
        if (retval != size) {
                if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                goto out;
        if (!elf_check_arch(&loc->elf_ex))
                goto out;
-        if (!bprm->file->f_op||!bprm->file->f_op->mmap)
+        if (!bprm->file->f_op || !bprm->file->f_op->mmap)
                goto out;
        /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        /* There was a PT_LOAD segment with p_memsz > p_filesz
                           before this one. Map anonymous pages, if needed,
                           and clear the area.  */
-                        retval = set_brk (elf_bss + load_bias,
+                        retval = set_brk(elf_bss + load_bias,
-                                          elf_brk + load_bias);
+                                         elf_brk + load_bias);
                        if (retval) {
                                send_sig(SIGKILL, current, 0);
                                goto out_free_dentry;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b..e49cce234c6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
 {
        unsigned int i;
-        kintegrityd_wq = create_workqueue("kintegrityd");
+        /*
+         * kintegrityd won't block much but may burn a lot of CPU cycles.
+         * Make it highpri CPU intensive wq with max concurrency of 1.
+         */
+        kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+                                         WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
        if (!kintegrityd_wq)
                panic("Failed to create kintegrityd\n");
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7..4bd454fa844 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
        struct bio *bio;
+        if (nr_iovecs > UIO_MAXIOV)
+                return NULL;
        bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
                      gfp_mask);
        if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
                                               gfp_t gfp_mask)
 {
-        struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
+        struct bio_map_data *bmd;
+        if (iov_count > UIO_MAXIOV)
+                return NULL;
+        bmd = kmalloc(sizeof(*bmd), gfp_mask);
        if (!bmd)
                return NULL;
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
                end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                start = uaddr >> PAGE_SHIFT;
+                /*
+                 * Overflow, abort
+                 */
+                if (end < start)
+                        return ERR_PTR(-EINVAL);
                nr_pages += end - start;
                len += iov[i].iov_len;
        }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;
+                /*
+                 * Overflow, abort
+                 */
+                if (end < start)
+                        return ERR_PTR(-EINVAL);
                nr_pages += end - start;
                /*
                 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
                unsigned long start = uaddr >> PAGE_SHIFT;
                const int local_nr_pages = end - start;
                const int page_limit = cur_page + local_nr_pages;
-                
                ret = get_user_pages_fast(uaddr, local_nr_pages,
                                write_to_vm, &pages[cur_page]);
                if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97..333a7bb4cb9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/major.h>
-#include <linux/smp_lock.h>
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
@@ -410,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void bdev_destroy_inode(struct inode *inode)
+static void bdev_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct bdev_inode *bdi = BDEV_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(bdev_cachep, bdi);
 }
+static void bdev_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, bdev_i_callback);
+}
 static void init_once(void *foo)
 {
        struct bdev_inode *ei = (struct bdev_inode *) foo;
@@ -427,7 +433,7 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&bdev->bd_inodes);
        INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
-        INIT_LIST_HEAD(&bdev->bd_holder_list);
+        INIT_LIST_HEAD(&bdev->bd_holder_disks);
 #endif
        inode_init_once(&ei->vfs_inode);
        /* Initialize mutex for freeze. */
@@ -467,7 +473,7 @@ static const struct super_operations bdev_sops = {
 static struct dentry *bd_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
-        return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+        return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
 }
 static struct file_system_type bd_type = {
@@ -663,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
        else if (bdev->bd_contains == bdev)
                return true;     /* is a whole device which isn't held */
-        else if (whole->bd_holder == bd_claim)
+        else if (whole->bd_holder == bd_may_claim)
                return true;     /* is a partition of a device that is being partitioned */
        else if (whole->bd_holder != NULL)
                return false;    /* is a partition of a held device */
@@ -775,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
        }
 }
-/* releases bdev_lock */
+#ifdef CONFIG_SYSFS
-static void __bd_abort_claiming(struct block_device *whole, void *holder)
+struct bd_holder_disk {
-{
+        struct list_head        list;
-        BUG_ON(whole->bd_claiming != holder);
+        struct gendisk          *disk;
-        whole->bd_claiming = NULL;
+        int                     refcnt;
-        wake_up_bit(&whole->bd_claiming, 0);
+};
-        spin_unlock(&bdev_lock);
-        bdput(whole);
-}
-/**
- * bd_abort_claiming - abort claiming a block device
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Abort a claiming block started by bd_start_claiming().  Note that
- * @whole is not the block device to be claimed but the whole device
- * returned by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_abort_claiming(struct block_device *whole, void *holder)
-{
-        spin_lock(&bdev_lock);
-        __bd_abort_claiming(whole, holder);             /* releases bdev_lock */
-}
-/* increment holders when we have a legitimate claim. requires bdev_lock */
-static void __bd_claim(struct block_device *bdev, struct block_device *whole,
-                                        void *holder)
-{
-        /* note that for a whole device bd_holders
-         * will be incremented twice, and bd_holder will
-         * be set to bd_claim before being set to holder
-         */
-        whole->bd_holders++;
-        whole->bd_holder = bd_claim;
-        bdev->bd_holders++;
-        bdev->bd_holder = holder;
-}
-/**
- * bd_finish_claiming - finish claiming a block device
- * @bdev: block device of interest (passed to bd_start_claiming())
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Finish a claiming block started by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_finish_claiming(struct block_device *bdev,
-                                struct block_device *whole, void *holder)
-{
-        spin_lock(&bdev_lock);
-        BUG_ON(!bd_may_claim(bdev, whole, holder));
-        __bd_claim(bdev, whole, holder);
-        __bd_abort_claiming(whole, holder); /* not actually an abort */
-}
-/**
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
- * bd_claim - claim a block device
+                                                  struct gendisk *disk)
- * @bdev: block device to claim
- * @holder: holder trying to claim @bdev
- *
- * Try to claim @bdev which must have been opened successfully.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 if successful, -EBUSY if @bdev is already claimed.
- */
-int bd_claim(struct block_device *bdev, void *holder)
 {
-        struct block_device *whole = bdev->bd_contains;
+        struct bd_holder_disk *holder;
-        int res;
-        might_sleep();
+        list_for_each_entry(holder, &bdev->bd_holder_disks, list)
+                if (holder->disk == disk)
-        spin_lock(&bdev_lock);
+                        return holder;
-        res = bd_prepare_to_claim(bdev, whole, holder);
+        return NULL;
-        if (res == 0)
-                __bd_claim(bdev, whole, holder);
-        spin_unlock(&bdev_lock);
-        return res;
-}
-EXPORT_SYMBOL(bd_claim);
-void bd_release(struct block_device *bdev)
-{
-        spin_lock(&bdev_lock);
-        if (!--bdev->bd_contains->bd_holders)
-                bdev->bd_contains->bd_holder = NULL;
-        if (!--bdev->bd_holders)
-                bdev->bd_holder = NULL;
-        spin_unlock(&bdev_lock);
 }
-EXPORT_SYMBOL(bd_release);
-#ifdef CONFIG_SYSFS
-/*
- * Functions for bd_claim_by_kobject / bd_release_from_kobject
- *
- *     If a kobject is passed to bd_claim_by_kobject()
- *     and the kobject has a parent directory,
- *     following symlinks are created:
- *        o from the kobject to the claimed bdev
- *        o from "holders" directory of the bdev to the parent of the kobject
- *     bd_release_from_kobject() removes these symlinks.
- *
- *     Example:
- *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
- *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
- *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
- *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- */
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
-        if (!from || !to)
-                return 0;
        return sysfs_create_link(from, to, kobject_name(to));
 }
 static void del_symlink(struct kobject *from, struct kobject *to)
 {
-        if (!from || !to)
-                return;
        sysfs_remove_link(from, kobject_name(to));
 }
-/*
- * 'struct bd_holder' contains pointers to kobjects symlinked by
- * bd_claim_by_kobject.
- * It's connected to bd_holder_list which is protected by bdev->bd_sem.
- */
-struct bd_holder {
-        struct list_head list;  /* chain of holders of the bdev */
-        int count;              /* references from the holder */
-        struct kobject *sdir;   /* holder object, e.g. "/block/dm-0/slaves" */
-        struct kobject *hdev;   /* e.g. "/block/dm-0" */
-        struct kobject *hdir;   /* e.g. "/block/sda/holders" */
-        struct kobject *sdev;   /* e.g. "/block/sda" */
-};
-/*
- * Get references of related kobjects at once.
- * Returns 1 on success. 0 on failure.
- *
- * Should call bd_holder_release_dirs() after successful use.
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
-                        struct bd_holder *bo)
-{
-        if (!bdev || !bo)
-                return 0;
-        bo->sdir = kobject_get(bo->sdir);
-        if (!bo->sdir)
-                return 0;
-        bo->hdev = kobject_get(bo->sdir->parent);
-        if (!bo->hdev)
-                goto fail_put_sdir;
-        bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-        if (!bo->sdev)
-                goto fail_put_hdev;
-        bo->hdir = kobject_get(bdev->bd_part->holder_dir);
-        if (!bo->hdir)
-                goto fail_put_sdev;
-        return 1;
-fail_put_sdev:
-        kobject_put(bo->sdev);
-fail_put_hdev:
-        kobject_put(bo->hdev);
-fail_put_sdir:
-        kobject_put(bo->sdir);
-        return 0;
-}
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
-        kobject_put(bo->hdir);
-        kobject_put(bo->sdev);
-        kobject_put(bo->hdev);
-        kobject_put(bo->sdir);
-}
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
-        struct bd_holder *bo;
-        bo = kzalloc(sizeof(*bo), GFP_KERNEL);
-        if (!bo)
-                return NULL;
-        bo->count = 1;
-        bo->sdir = kobj;
-        return bo;
-}
-static void free_bd_holder(struct bd_holder *bo)
-{
-        kfree(bo);
-}
 /**
- * find_bd_holder - find matching struct bd_holder from the block device
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
 *
- * @bdev:       struct block device to be searched
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- * @bo:         target struct bd_holder
 *
- * Returns matching entry with @bo in @bdev->bd_holder_list.
+ * This function creates the following sysfs symlinks.
- * If found, increment the reference count and return the pointer.
+ *
- * If not found, returns NULL.
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
- */
+ * - from "holders" directory of the @bdev to the holder @disk
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
+ *
-                                        struct bd_holder *bo)
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
-{
+ * passed to bd_link_disk_holder(), then:
-        struct bd_holder *tmp;
+ *
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
-        list_for_each_entry(tmp, &bdev->bd_holder_list, list)
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
-                if (tmp->sdir == bo->sdir) {
-                        tmp->count++;
-                        return tmp;
-                }
-        return NULL;
-}
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 *
- * @bdev:       block device to be bd_claimed
+ * The caller must have claimed @bdev before calling this function and
- * @bo:         preallocated and initialized by alloc_bd_holder()
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
 *
- * Add @bo to @bdev->bd_holder_list, create symlinks.
+ * CONTEXT:
+ * Might sleep.
 *
- * Returns 0 if symlinks are created.
+ * RETURNS:
- * Returns -ve if something fails.
+ * 0 on success, -errno on failure.
 */
-static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
-        int err;
+        struct bd_holder_disk *holder;
+        int ret = 0;
-        if (!bo)
+        mutex_lock(&bdev->bd_mutex);
-                return -EINVAL;
-        if (!bd_holder_grab_dirs(bdev, bo))
+        WARN_ON_ONCE(!bdev->bd_holder);
-                return -EBUSY;
-        err = add_symlink(bo->sdir, bo->sdev);
+        /* FIXME: remove the following once add_disk() handles errors */
-        if (err)
+        if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
-                return err;
+                goto out_unlock;
-        err = add_symlink(bo->hdir, bo->hdev);
+        holder = bd_find_holder_disk(bdev, disk);
-        if (err) {
+        if (holder) {
-                del_symlink(bo->sdir, bo->sdev);
+                holder->refcnt++;
-                return err;
+                goto out_unlock;
        }
-        list_add_tail(&bo->list, &bdev->bd_holder_list);
+        holder = kzalloc(sizeof(*holder), GFP_KERNEL);
-        return 0;
+        if (!holder) {
-}
+                ret = -ENOMEM;
+                goto out_unlock;
-/**
- * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
- *
- * @bdev:       block device to be bd_claimed
- * @kobj:       holder's kobject
- *
- * If there is matching entry with @kobj in @bdev->bd_holder_list
- * and no other bd_claim() from the same kobject,
- * remove the struct bd_holder from the list, delete symlinks for it.
- *
- * Returns a pointer to the struct bd_holder when it's removed from the list
- * and ready to be freed.
- * Returns NULL if matching claim isn't found or there is other bd_claim()
- * by the same kobject.
- */
-static struct bd_holder *del_bd_holder(struct block_device *bdev,
-                                        struct kobject *kobj)
-{
-        struct bd_holder *bo;
-        list_for_each_entry(bo, &bdev->bd_holder_list, list) {
-                if (bo->sdir == kobj) {
-                        bo->count--;
-                        BUG_ON(bo->count < 0);
-                        if (!bo->count) {
-                                list_del(&bo->list);
-                                del_symlink(bo->sdir, bo->sdev);
-                                del_symlink(bo->hdir, bo->hdev);
-                                bd_holder_release_dirs(bo);
-                                return bo;
-                        }
-                        break;
-                }
        }
-        return NULL;
+        INIT_LIST_HEAD(&holder->list);
-}
+        holder->disk = disk;
+        holder->refcnt = 1;
-/**
+        ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
- * bd_claim_by_kobject - bd_claim() with additional kobject signature
+        if (ret)
- *
+                goto out_free;
- * @bdev:       block device to be claimed
- * @holder:     holder's signature
- * @kobj:       holder's kobject
- *
- * Do bd_claim() and if it succeeds, create sysfs symlinks between
- * the bdev and the holder's kobject.
- * Use bd_release_from_kobject() when relesing the claimed bdev.
- *
- * Returns 0 on success. (same as bd_claim())
- * Returns errno on failure.
- */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
-                                struct kobject *kobj)
-{
-        int err;
-        struct bd_holder *bo, *found;
-        if (!kobj)
+        ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
-                return -EINVAL;
+        if (ret)
+                goto out_del;
-        bo = alloc_bd_holder(kobj);
+        list_add(&holder->list, &bdev->bd_holder_disks);
-        if (!bo)
+        goto out_unlock;
-                return -ENOMEM;
-        mutex_lock(&bdev->bd_mutex);
+out_del:
+        del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
-        err = bd_claim(bdev, holder);
+out_free:
-        if (err)
+        kfree(holder);
-                goto fail;
+out_unlock:
-        found = find_bd_holder(bdev, bo);
-        if (found)
-                goto fail;
-        err = add_bd_holder(bdev, bo);
-        if (err)
-                bd_release(bdev);
-        else
-                bo = NULL;
-fail:
        mutex_unlock(&bdev->bd_mutex);
-        free_bd_holder(bo);
+        return ret;
-        return err;
 }
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 /**
- * bd_release_from_kobject - bd_release() with additional kobject signature
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
 *
- * @bdev:       block device to be released
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- * @kobj:       holder's kobject
 *
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
+ * CONTEXT:
+ * Might sleep.
 */
-static void bd_release_from_kobject(struct block_device *bdev,
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
-                                        struct kobject *kobj)
 {
-        if (!kobj)
+        struct bd_holder_disk *holder;
-                return;
        mutex_lock(&bdev->bd_mutex);
-        bd_release(bdev);
-        free_bd_holder(del_bd_holder(bdev, kobj));
-        mutex_unlock(&bdev->bd_mutex);
-}
-/**
+        holder = bd_find_holder_disk(bdev, disk);
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev:       block device to be claimed
- * @holder:     holder's signature
- * @disk:       holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
-                        struct gendisk *disk)
-{
-        return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
-/**
+        if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
+                del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
- *
+                del_symlink(bdev->bd_part->holder_dir,
- * @bdev:       block device to be claimed
+                            &disk_to_dev(disk)->kobj);
- * @disk:       holder's gendisk
+                list_del_init(&holder->list);
- *
+                kfree(holder);
- * Call bd_release_from_kobject() and put @disk->slave_dir.
+        }
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
-        bd_release_from_kobject(bdev, disk->slave_dir);
-        kobject_put(disk->slave_dir);
-}
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
-#endif
-/*
+        mutex_unlock(&bdev->bd_mutex);
- * Tries to open block device by device number.  Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number.  _Never_
- * to be used for internal purposes.  If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
-{
-        struct block_device *bdev = bdget(dev);
-        int err = -ENOMEM;
-        if (bdev)
-                err = blkdev_get(bdev, mode);
-        return err ? ERR_PTR(err) : bdev;
 }
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
-EXPORT_SYMBOL(open_by_devnum);
+#endif
 /**
 * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1303,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
 {
        struct gendisk *disk = bdev->bd_disk;
        const struct block_device_operations *bdops = disk->fops;
+        unsigned int events;
-        if (!bdops->media_changed)
+        events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
-                return 0;
+                                   DISK_EVENT_EJECT_REQUEST);
-        if (!bdops->media_changed(bdev->bd_disk))
+        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                return 0;
        flush_disk(bdev);
@@ -1469,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        return ret;
 }
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * opened with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-        return __blkdev_get(bdev, mode, 0);
+        struct block_device *whole = NULL;
+        int res;
+        WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+        if ((mode & FMODE_EXCL) && holder) {
+                whole = bd_start_claiming(bdev, holder);
+                if (IS_ERR(whole)) {
+                        bdput(bdev);
+                        return PTR_ERR(whole);
+                }
+        }
+        res = __blkdev_get(bdev, mode, 0);
+        /* __blkdev_get() may alter read only status, check it afterwards */
+        if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+                __blkdev_put(bdev, mode, 0);
+                res = -EACCES;
+        }
+        if (whole) {
+                /* finish claiming */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+                if (!res) {
+                        BUG_ON(!bd_may_claim(bdev, whole, holder));
+                        /*
+                         * Note that for a whole device bd_holders
+                         * will be incremented twice, and bd_holder
+                         * will be set to bd_may_claim before being
+                         * set to holder
+                         */
+                        whole->bd_holders++;
+                        whole->bd_holder = bd_may_claim;
+                        bdev->bd_holders++;
+                        bdev->bd_holder = holder;
+                }
+                /* tell others that we're done */
+                BUG_ON(whole->bd_claiming != holder);
+                whole->bd_claiming = NULL;
+                wake_up_bit(&whole->bd_claiming, 0);
+                spin_unlock(&bdev_lock);
+                /*
+                 * Block event polling for write claims.  Any write
+                 * holder makes the write_holder state stick until all
+                 * are released.  This is good enough and tracking
+                 * individual writeable reference is too fragile given
+                 * the way @mode is used in blkdev_get/put().
+                 */
+                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                        bdev->bd_write_holder = true;
+                        disk_block_events(bdev->bd_disk);
+                }
+                mutex_unlock(&bdev->bd_mutex);
+                bdput(whole);
+        }
+        return res;
 }
 EXPORT_SYMBOL(blkdev_get);
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+                                        void *holder)
+{
+        struct block_device *bdev;
+        int err;
+        bdev = lookup_bdev(path);
+        if (IS_ERR(bdev))
+                return bdev;
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+        struct block_device *bdev;
+        int err;
+        bdev = bdget(dev);
+        if (!bdev)
+                return ERR_PTR(-ENOMEM);
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
-        struct block_device *whole = NULL;
        struct block_device *bdev;
-        int res;
        /*
         * Preserve backwards compatibility and allow large file access
@@ -1500,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
        if (bdev == NULL)
                return -ENOMEM;
-        if (filp->f_mode & FMODE_EXCL) {
-                whole = bd_start_claiming(bdev, filp);
-                if (IS_ERR(whole)) {
-                        bdput(bdev);
-                        return PTR_ERR(whole);
-                }
-        }
        filp->f_mapping = bdev->bd_inode->i_mapping;
-        res = blkdev_get(bdev, filp->f_mode);
+        return blkdev_get(bdev, filp->f_mode, filp);
-        if (whole) {
-                if (res == 0)
-                        bd_finish_claiming(bdev, whole, filp);
-                else
-                        bd_abort_claiming(whole, filp);
-        }
-        return res;
 }
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1533,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                bdev->bd_part_count--;
        if (!--bdev->bd_openers) {
+                WARN_ON_ONCE(bdev->bd_holders);
                sync_blockdev(bdev);
                kill_bdev(bdev);
        }
@@ -1563,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+        if (mode & FMODE_EXCL) {
+                bool bdev_free;
+                /*
+                 * Release a claim on the device.  The holder fields
+                 * are protected with bdev_lock.  bd_mutex is to
+                 * synchronize disk_holder unlinking.
+                 */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+                WARN_ON_ONCE(--bdev->bd_holders < 0);
+                WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+                /* bd_contains might point to self, check in a separate step */
+                if ((bdev_free = !bdev->bd_holders))
+                        bdev->bd_holder = NULL;
+                if (!bdev->bd_contains->bd_holders)
+                        bdev->bd_contains->bd_holder = NULL;
+                spin_unlock(&bdev_lock);
+                /*
+                 * If this was the last claim, remove holder link and
+                 * unblock evpoll if it was a write holder.
+                 */
+                if (bdev_free) {
+                        if (bdev->bd_write_holder) {
+                                disk_unblock_events(bdev->bd_disk);
+                                bdev->bd_write_holder = false;
+                        } else
+                                disk_check_events(bdev->bd_disk);
+                }
+                mutex_unlock(&bdev->bd_mutex);
+        } else
+                disk_check_events(bdev->bd_disk);
        return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1570,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-        if (bdev->bd_holder == filp)
-                bd_release(bdev);
        return blkdev_put(bdev, filp->f_mode);
 }
@@ -1716,67 +1601,6 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
-/**
- * open_bdev_exclusive  -  open a block device by name and set it up for use
- *
- * @path:       special file representing the block device
- * @mode:       FMODE_... combination to pass be used
- * @holder:     owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
-        struct block_device *bdev, *whole;
-        int error;
-        bdev = lookup_bdev(path);
-        if (IS_ERR(bdev))
-                return bdev;
-        whole = bd_start_claiming(bdev, holder);
-        if (IS_ERR(whole)) {
-                bdput(bdev);
-                return whole;
-        }
-        error = blkdev_get(bdev, mode);
-        if (error)
-                goto out_abort_claiming;
-        error = -EACCES;
-        if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-                goto out_blkdev_put;
-        bd_finish_claiming(bdev, whole, holder);
-        return bdev;
-out_blkdev_put:
-        blkdev_put(bdev, mode);
-out_abort_claiming:
-        bd_abort_claiming(whole, holder);
-        return ERR_PTR(error);
-}
-EXPORT_SYMBOL(open_bdev_exclusive);
-/**
- * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev:       blockdevice to close
- * @mode:       mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
-        bd_release(bdev);
-        blkdev_put(bdev, mode);
-}
-EXPORT_SYMBOL(close_bdev_exclusive);
 int __invalidate_device(struct block_device *bdev)
 {
        struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e57..ecb9fd3be14 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
        select LIBCRC32C
        select ZLIB_INFLATE
        select ZLIB_DEFLATE
+        select LZO_COMPRESS
+        select LZO_DECOMPRESS
        help
          Btrfs is a new filesystem with extents, writable snapshotting,
          support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32f..31610ea73ae 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           transaction.o inode.o file.o tree-defrag.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-           export.o tree-log.o acl.o free-space-cache.o zlib.o \
+           export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b..15b5ca2a260 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                size = __btrfs_getxattr(inode, name, value, size);
                if (size > 0) {
                        acl = posix_acl_from_xattr(value, size);
-                        if (IS_ERR(acl))
+                        if (IS_ERR(acl)) {
+                                kfree(value);
                                return acl;
+                        }
                        set_cached_acl(inode, type, acl);
                }
                kfree(value);
@@ -185,18 +187,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
        return ret;
 }
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl;
        int error = -EAGAIN;
-        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        error = -ECHILD;
-        if (IS_ERR(acl))
+        } else {
-                return PTR_ERR(acl);
+                struct posix_acl *acl;
-        if (acl) {
+                acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-                error = posix_acl_permission(inode, acl, mask);
+                if (IS_ERR(acl))
-                posix_acl_release(acl);
+                        return PTR_ERR(acl);
+                if (acl) {
+                        error = posix_acl_permission(inode, acl, mask);
+                        posix_acl_release(acl);
+                }
        }
        return error;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca..ccc991c542d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
        /*
         * always compress this one file
         */
-        unsigned force_compress:1;
+        unsigned force_compress:4;
        struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d..f745287fbf2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
        /* number of bytes on disk */
        unsigned long compressed_len;
+        /* the compression algorithm for this bio */
+        int compress_type;
        /* number of compressed pages in the array */
        unsigned long nr_pages;
@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
                                        u64 first_byte, gfp_t gfp_flags)
 {
-        struct bio *bio;
        int nr_vecs;
        nr_vecs = bio_get_nr_vecs(bdev);
-        bio = bio_alloc(gfp_flags, nr_vecs);
+        return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
-        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-                while (!bio && (nr_vecs /= 2))
-                        bio = bio_alloc(gfp_flags, nr_vecs);
-        }
-        if (bio) {
-                bio->bi_size = 0;
-                bio->bi_bdev = bdev;
-                bio->bi_sector = first_byte >> 9;
-        }
-        return bio;
 }
 static int check_compressed_csum(struct inode *inode,
@@ -186,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
        /* ok, we're the last bio for this extent, lets start
         * the decompression.
         */
-        ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+        ret = btrfs_decompress_biovec(cb->compress_type,
-                                        cb->start,
+                                      cb->compressed_pages,
-                                        cb->orig_bio->bi_io_vec,
+                                      cb->start,
-                                        cb->orig_bio->bi_vcnt,
+                                      cb->orig_bio->bi_io_vec,
-                                        cb->compressed_len);
+                                      cb->orig_bio->bi_vcnt,
+                                      cb->compressed_len);
 csum_failed:
        if (ret)
                cb->errors = 1;
@@ -601,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        cb->len = uncompressed_len;
        cb->compressed_len = compressed_len;
+        cb->compress_type = extent_compress_type(bio_flags);
        cb->orig_bio = bio;
        nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -690,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        bio_put(comp_bio);
        return 0;
 }
+/*
+ * Per-algorithm workspace pools.  find_workspace() and free_workspace()
+ * index each of these arrays with (compression type - 1).
+ */
+/* workspaces that were handed back and can be reused */
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+/* taken around idle list and idle counter updates in find/free_workspace */
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+/* number of workspaces sitting on the idle list */
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+/* number of workspaces allocated and not yet freed (idle or in use) */
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+/* find_workspace() sleeps here when too many workspaces are allocated */
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+/* per-algorithm callbacks, indexed the same way as the pools above */
+struct btrfs_compress_op *btrfs_compress_op[] = {
+        &btrfs_zlib_compress,
+        &btrfs_lzo_compress,
+};
+/*
+ * prepare the workspace pool of every compression type: empty idle
+ * list, initialized lock, zero allocation count and an empty wait queue.
+ * Always returns 0.
+ */
+int __init btrfs_init_compress(void)
+{
+        int type_idx = 0;
+        while (type_idx < BTRFS_COMPRESS_TYPES) {
+                INIT_LIST_HEAD(&comp_idle_workspace[type_idx]);
+                spin_lock_init(&comp_workspace_lock[type_idx]);
+                atomic_set(&comp_alloc_workspace[type_idx], 0);
+                init_waitqueue_head(&comp_workspace_wait[type_idx]);
+                type_idx++;
+        }
+        return 0;
+}
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ *
+ * type is the compression type; the pools are indexed with type - 1.
+ * NOTE(review): type is not range checked here, so callers presumably
+ * never pass BTRFS_COMPRESS_NONE -- confirm against the callers.
+ *
+ * An idle workspace is reused when one exists.  Otherwise a new one is
+ * allocated, unless more than num_online_cpus() are already allocated
+ * for this type, in which case we sleep until free_workspace() wakes us
+ * and then retry from the top.
+ */
+static struct list_head *find_workspace(int type)
+{
+        struct list_head *workspace;
+        int cpus = num_online_cpus();
+        int idx = type - 1;
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+again:
+        spin_lock(workspace_lock);
+        /* fast path: take the first idle workspace off the list */
+        if (!list_empty(idle_workspace)) {
+                workspace = idle_workspace->next;
+                list_del(workspace);
+                (*num_workspace)--;
+                spin_unlock(workspace_lock);
+                return workspace;
+        }
+        /* too many workspaces allocated already, wait for one to come back */
+        if (atomic_read(alloc_workspace) > cpus) {
+                DEFINE_WAIT(wait);
+                spin_unlock(workspace_lock);
+                prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+                /*
+                 * recheck after queueing ourselves so a workspace released
+                 * in the meantime is not slept through
+                 */
+                if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                        schedule();
+                finish_wait(workspace_wait, &wait);
+                goto again;
+        }
+        /* account for the new workspace before dropping the lock */
+        atomic_inc(alloc_workspace);
+        spin_unlock(workspace_lock);
+        workspace = btrfs_compress_op[idx]->alloc_workspace();
+        if (IS_ERR(workspace)) {
+                /* allocation failed: undo the accounting and let a waiter retry */
+                atomic_dec(alloc_workspace);
+                wake_up(workspace_wait);
+        }
+        return workspace;
+}
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ *
+ * type is the compression type (the pools are indexed with type - 1) and
+ * workspace is one that find_workspace() returned for that type.  At most
+ * num_online_cpus() workspaces are kept idle; anything beyond that is
+ * released through the algorithm's free_workspace callback.  In both
+ * cases tasks sleeping in find_workspace() are woken up.
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+        int idx = type - 1;
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+        spin_lock(workspace_lock);
+        if (*num_workspace < num_online_cpus()) {
+                /* keep it around for the next find_workspace() */
+                list_add_tail(workspace, idle_workspace);
+                (*num_workspace)++;
+                spin_unlock(workspace_lock);
+                goto wake;
+        }
+        spin_unlock(workspace_lock);
+        /* enough idle workspaces already, give this one back */
+        btrfs_compress_op[idx]->free_workspace(workspace);
+        atomic_dec(alloc_workspace);
+wake:
+        /*
+         * waitqueue_active() is a lockless check.  Without a full barrier
+         * it may be ordered before the list/counter updates above, so a
+         * task that is just going to sleep in find_workspace() could be
+         * missed and never woken.
+         */
+        smp_mb();
+        if (waitqueue_active(workspace_wait))
+                wake_up(workspace_wait);
+}
+/*
+ * module exit helper: drain the idle list of every compression type,
+ * releasing each workspace through its algorithm's free_workspace
+ * callback and dropping the allocation count accordingly.
+ */
+static void free_workspaces(void)
+{
+        int type_idx;
+        for (type_idx = 0; type_idx < BTRFS_COMPRESS_TYPES; type_idx++) {
+                struct list_head *idle = &comp_idle_workspace[type_idx];
+                for (;;) {
+                        struct list_head *ws;
+                        if (list_empty(idle))
+                                break;
+                        ws = idle->next;
+                        list_del(ws);
+                        btrfs_compress_op[type_idx]->free_workspace(ws);
+                        atomic_dec(&comp_alloc_workspace[type_idx]);
+                }
+        }
+}
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * type selects the compression algorithm (index type - 1 in
+ * btrfs_compress_op)
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ *
+ * returns whatever the algorithm's compress_pages callback returns, or
+ * the error from find_workspace() if no workspace could be obtained
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                         u64 start, unsigned long len,
+                         struct page **pages,
+                         unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
+                         unsigned long *total_in,
+                         unsigned long *total_out,
+                         unsigned long max_out)
+{
+        struct list_head *workspace;
+        int ret;
+        workspace = find_workspace(type);
+        /* hand back the real errno instead of a bare -1 (which is -EPERM) */
+        if (IS_ERR(workspace))
+                return PTR_ERR(workspace);
+        ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                      start, len, pages,
+                                                      nr_dest_pages, out_pages,
+                                                      total_in, total_out,
+                                                      max_out);
+        free_workspace(type, workspace);
+        return ret;
+}
+/*
+ * decompress a whole compressed extent into the pages of a bio.
+ *
+ * type selects the compression algorithm
+ *
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The bio was created by readpages: its pages hold the uncompressed
+ * data and may not be contiguous, but all of them fall inside the range
+ * of bytes covered by the compressed extent.
+ *
+ * returns -ENOMEM when no workspace is available, otherwise the result
+ * of the algorithm's decompress_biovec callback.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                            struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+        struct btrfs_compress_op *ops;
+        struct list_head *ws = find_workspace(type);
+        int err;
+        if (IS_ERR(ws))
+                return -ENOMEM;
+        ops = btrfs_compress_op[type - 1];
+        err = ops->decompress_biovec(ws, pages_in, disk_start,
+                                     bvec, vcnt, srclen);
+        free_workspace(type, ws);
+        return err;
+}
+/*
+ * the simple decompression entry point: the compressed data fits in a
+ * single page and a single page is read out of it.
+ *
+ * type selects the compression algorithm, start_byte is the offset into
+ * the compressed data we're interested in.
+ *
+ * returns -ENOMEM when no workspace is available, otherwise the result
+ * of the algorithm's decompress callback.
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+        struct btrfs_compress_op *ops;
+        struct list_head *ws = find_workspace(type);
+        int err;
+        if (IS_ERR(ws))
+                return -ENOMEM;
+        ops = btrfs_compress_op[type - 1];
+        err = ops->decompress(ws, data_in, dest_page, start_byte,
+                              srclen, destlen);
+        free_workspace(type, ws);
+        return err;
+}
+/*
+ * module exit hook for the compression code: releases every idle
+ * workspace via free_workspaces()
+ */
+void __exit btrfs_exit_compress(void)
+{
+        free_workspaces();
+}
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf is the working buffer holding the decompressed bytes.
+ *
+ * buf_start is the byte offset of the start of our workspace buffer.
+ *
+ * total_out is the offset of the end of the data in the buffer, so
+ * total_out - buf_start bytes are available in buf.
+ *
+ * disk_start is subtracted from each destination page's file offset to
+ * get that page's offset in the same space as buf_start/total_out.
+ *
+ * bvec/vcnt are the destination pages and their count.
+ *
+ * page_index and pg_offset track the current destination page and the
+ * offset inside it; both are updated in place so the next call resumes
+ * where this one stopped.
+ *
+ * Returns 0 once the last page of the biovec has been filled, and 1
+ * otherwise.
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                              unsigned long total_out, u64 disk_start,
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset)
+{
+        unsigned long buf_offset;
+        unsigned long current_buf_start;
+        unsigned long start_byte;
+        unsigned long working_bytes = total_out - buf_start;
+        unsigned long bytes;
+        char *kaddr;
+        struct page *page_out = bvec[*page_index].bv_page;
+        /*
+         * start byte is the first byte of the page we're currently
+         * copying into relative to the start of the compressed data.
+         */
+        start_byte = page_offset(page_out) - disk_start;
+        /* we haven't yet hit data corresponding to this page */
+        if (total_out <= start_byte)
+                return 1;
+        /*
+         * the start of the data we care about is offset into
+         * the middle of our working buffer
+         */
+        if (total_out > start_byte && buf_start < start_byte) {
+                buf_offset = start_byte - buf_start;
+                working_bytes -= buf_offset;
+        } else {
+                buf_offset = 0;
+        }
+        current_buf_start = buf_start;
+        /* copy bytes from the working buffer into the pages */
+        while (working_bytes > 0) {
+                /* limited by room left in the page, buf_offset and bytes left */
+                bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                            PAGE_CACHE_SIZE - buf_offset);
+                bytes = min(bytes, working_bytes);
+                kaddr = kmap_atomic(page_out, KM_USER0);
+                memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+                kunmap_atomic(kaddr, KM_USER0);
+                flush_dcache_page(page_out);
+                *pg_offset += bytes;
+                buf_offset += bytes;
+                working_bytes -= bytes;
+                current_buf_start += bytes;
+                /* check if we need to pick another page */
+                if (*pg_offset == PAGE_CACHE_SIZE) {
+                        (*page_index)++;
+                        /* every page of the biovec has been filled */
+                        if (*page_index >= vcnt)
+                                return 0;
+                        page_out = bvec[*page_index].bv_page;
+                        *pg_offset = 0;
+                        start_byte = page_offset(page_out) - disk_start;
+                        /*
+                         * make sure our new page is covered by this
+                         * working buffer
+                         */
+                        if (total_out <= start_byte)
+                                return 1;
+                        /*
+                         * the next page in the biovec might not be adjacent
+                         * to the last page, but it might still be found
+                         * inside this working buffer. bump our offset pointer
+                         */
+                        if (total_out > start_byte &&
+                            current_buf_start < start_byte) {
+                                buf_offset = start_byte - buf_start;
+                                working_bytes = total_out - start_byte;
+                                current_buf_start = buf_start + buf_offset;
+                        }
+                }
+        }
+        /* buffer used up, more decompressed data is needed for the pages */
+        return 1;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa71..51000174b9d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
-int btrfs_zlib_decompress(unsigned char *data_in,
+int btrfs_init_compress(void);
-                          struct page *dest_page,
+void btrfs_exit_compress(void);
-                          unsigned long start_byte,
-                          size_t srclen, size_t destlen);
+int btrfs_compress_pages(int type, struct address_space *mapping,
-int btrfs_zlib_compress_pages(struct address_space *mapping,
+                         u64 start, unsigned long len,
-                              u64 start, unsigned long len,
+                         struct page **pages,
-                              struct page **pages,
+                         unsigned long nr_dest_pages,
-                              unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
-                              unsigned long *out_pages,
+                         unsigned long *total_in,
-                              unsigned long *total_in,
+                         unsigned long *total_out,
-                              unsigned long *total_out,
+                         unsigned long max_out);
-                              unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                            struct bio_vec *bvec, int vcnt, size_t srclen);
-                              u64 disk_start,
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
-                              struct bio_vec *bvec,
+                     unsigned long start_byte, size_t srclen, size_t destlen);
-                              int vcnt,
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
-                              size_t srclen);
+                              unsigned long total_out, u64 disk_start,
-void btrfs_zlib_exit(void);
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset);
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long len, u64 disk_start,
                                  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags);
+/*
+ * callbacks one compression algorithm provides.  The generic code in
+ * compression.c obtains a workspace and passes it as the first argument
+ * of the compress/decompress callbacks.
+ */
+struct btrfs_compress_op {
+        /* allocate a workspace; callers check the result with IS_ERR() */
+        struct list_head *(*alloc_workspace)(void);
+        /* release a workspace obtained from alloc_workspace */
+        void (*free_workspace)(struct list_head *workspace);
+        /* backend of btrfs_compress_pages(), same arguments after workspace */
+        int (*compress_pages)(struct list_head *workspace,
+                              struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out);
+        /* backend of btrfs_decompress_biovec() */
+        int (*decompress_biovec)(struct list_head *workspace,
+                                 struct page **pages_in,
+                                 u64 disk_start,
+                                 struct bio_vec *bvec,
+                                 int vcnt,
+                                 size_t srclen);
+        /* backend of btrfs_decompress() */
+        int (*decompress)(struct list_head *workspace,
+                          unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen);
+};
+/* the algorithm implementations referenced from btrfs_compress_op[] */
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
 #endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac17159925..b5baff0dccf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+        if (!p)
+                return;
        btrfs_release_path(NULL, p);
        kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);
        right = read_node_slot(root, upper, slot + 1);
+        if (right == NULL)
+                return 1;
        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);
        left = read_node_slot(root, path->nodes[1], slot - 1);
+        if (left == NULL)
+                return 1;
        btrfs_tree_lock(left);
        btrfs_set_lock_blocking(left);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b4..2c98b3af605 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN       (1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC         (1ULL << 1)
+/*
+ * File system states
+ */
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR          (1ULL << 2)
 #define BTRFS_SUPER_FLAG_SEEDING        (1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP       (1ULL << 33)
@@ -398,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF    (1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL   (1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS     (1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO     (1ULL << 3)
 #define BTRFS_FEATURE_COMPAT_SUPP               0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP            0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP                     \
        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
-         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
+         BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 /*
 * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 enum btrfs_compression_type {
-        BTRFS_COMPRESS_NONE = 0,
+        BTRFS_COMPRESS_NONE  = 0,
-        BTRFS_COMPRESS_ZLIB = 1,
+        BTRFS_COMPRESS_ZLIB  = 1,
-        BTRFS_COMPRESS_LAST = 2,
+        BTRFS_COMPRESS_LZO   = 2,
+        BTRFS_COMPRESS_TYPES = 2,
+        BTRFS_COMPRESS_LAST  = 3,
 };
 struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
        u8 type;
 } __attribute__ ((__packed__));
+#define BTRFS_ROOT_SUBVOL_RDONLY        (1ULL << 0)
 struct btrfs_root_item {
        struct btrfs_inode_item inode;
        __le64 generation;
@@ -808,9 +823,9 @@ struct btrfs_block_group_cache {
        int extents_thresh;
        int free_extents;
        int total_bitmaps;
-        int ro:1;
+        unsigned int ro:1;
-        int dirty:1;
+        unsigned int dirty:1;
-        int iref:1;
+        unsigned int iref:1;
        int disk_cache_state;
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
         */
        u64 last_trans_log_full_commit;
        u64 open_ioctl_trans;
-        unsigned long mount_opt;
+        unsigned long mount_opt:20;
+        unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
        struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
        unsigned metadata_ratio;
        void *bdev_holder;
+        /* filesystem state */
+        u64 fs_state;
 };
 /*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
                         last_snapshot, 64);
+/*
+ * true when the root item of this root has BTRFS_ROOT_SUBVOL_RDONLY set.
+ *
+ * NOTE(review): root_item looks like an on-disk structure with __le64
+ * members; if flags is little-endian too, this test presumably needs
+ * cpu_to_le64() on the mask to work on big-endian hosts -- confirm.
+ */
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+        return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+                                   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                               u64 num_bytes);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2541,10 +2572,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                     unsigned int line, int errno);
+/*
+ * hand errno to __btrfs_std_error() together with the calling function
+ * name and line number, but only when errno is non-zero.
+ *
+ * errno is evaluated exactly once, so passing an expression with side
+ * effects (a function call, for instance) is safe.
+ */
+#define btrfs_std_error(fs_info, errno)                         \
+do {                                                            \
+        int __btrfs_std_errno = (errno);                        \
+        if (__btrfs_std_errno)                                  \
+                __btrfs_std_error((fs_info), __func__, __LINE__,\
+                                  __btrfs_std_errno);           \
+} while (0)
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 #else
 #define btrfs_check_acl NULL
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d718..b531c36455d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
 #include <linux/slab.h>
+#include <linux/migrate.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -43,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                                    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages,
+                                        int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 /*
 * end_io_wq structs are used to do processing in task context when an IO is
@@ -352,9 +367,15 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        WARN_ON(len == 0);
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                WARN_ON(1);
+                goto out;
+        }
        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                             btrfs_header_generation(eb));
        BUG_ON(ret);
+        WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                WARN_ON(1);
@@ -424,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        WARN_ON(len == 0);
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                ret = -EIO;
+                goto out;
+        }
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
@@ -693,6 +718,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                                   __btree_submit_bio_done);
 }
+#ifdef CONFIG_MIGRATION
+/*
+ * migratepage callback for btree pages.  Returns -EAGAIN when the page
+ * cannot be moved right now, otherwise the result of migrate_page().
+ */
+static int btree_migratepage(struct address_space *mapping,
+                        struct page *newpage, struct page *page)
+{
+        int busy;
+        /*
+         * a dirty btree page can't be written safely from here because
+         * the locking hook has not been done
+         */
+        busy = PageDirty(page);
+        /*
+         * buffers may be managed in a filesystem specific way, so the
+         * page must have none left or we must be able to drop them
+         */
+        if (!busy && page_has_private(page))
+                busy = !try_to_release_page(page, GFP_KERNEL);
+        if (busy)
+                return -EAGAIN;
+        return migrate_page(mapping, newpage, page);
+}
+#endif
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
@@ -707,8 +753,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
        }
        redirty_page_for_writepage(wbc, page);
-        eb = btrfs_find_tree_block(root, page_offset(page),
+        eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
-                                      PAGE_CACHE_SIZE);
        WARN_ON(!eb);
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +844,9 @@ static const struct address_space_operations btree_aops = {
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
        .sync_page      = block_sync_page,
+#ifdef CONFIG_MIGRATION
+        .migratepage    = btree_migratepage,
+#endif
 };
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1029,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-        BUG_ON(!root->node);
+        if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+                free_extent_buffer(root->node);
+                return -EIO;
+        }
        root->commit_root = btrfs_root_node(root);
        return 0;
 }
@@ -1116,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        }
        btrfs_free_path(path);
        if (ret) {
+                kfree(root);
                if (ret > 0)
                        ret = -ENOENT;
                return ERR_PTR(ret);
@@ -1538,10 +1590,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                                 GFP_NOFS);
        struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
                                                 GFP_NOFS);
-        struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
+        struct btrfs_root *tree_root = btrfs_sb(sb);
-                                               GFP_NOFS);
+        struct btrfs_fs_info *fs_info = tree_root->fs_info;
-        struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
-                                                GFP_NOFS);
        struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
                                                GFP_NOFS);
        struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1686,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-        if (!bh)
+        if (!bh) {
+                err = -EINVAL;
                goto fail_iput;
+        }
        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
        memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1700,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (!btrfs_super_root(disk_super))
                goto fail_iput;
+        /* check FS state, whether FS is broken. */
+        fs_info->fs_state |= btrfs_super_flags(disk_super);
+        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
        ret = btrfs_parse_options(tree_root, options);
        if (ret) {
                err = ret;
@@ -1717,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        }
        features = btrfs_super_incompat_flags(disk_super);
-        if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-                features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
-                btrfs_set_super_incompat_flags(disk_super, features);
+                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
-        }
+        btrfs_set_super_incompat_flags(disk_super, features);
        features = btrfs_super_compat_ro_flags(disk_super) &
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1930,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
-        if (btrfs_super_log_root(disk_super) != 0) {
+        /* do not make disk changes in broken FS */
+        if (btrfs_super_log_root(disk_super) != 0 &&
+            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                u64 bytenr = btrfs_super_log_root(disk_super);
                if (fs_devices->rw_devices == 0) {
@@ -2415,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
        smp_mb();
        btrfs_put_block_group_cache(fs_info);
+        /*
+         * Here come 2 situations when btrfs is broken to flip readonly:
+         *
+         * 1. when btrfs flips readonly somewhere else before
+         * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+         * and btrfs will skip to write sb directly to keep
+         * ERROR state on disk.
+         *
+         * 2. when btrfs flips readonly just in btrfs_commit_super,
+         * and in such case, btrfs cannot write sb via btrfs_commit_super,
+         * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+         * btrfs will cleanup all FS resources first and write sb then.
+         */
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-                ret =  btrfs_commit_super(root);
+                ret = btrfs_commit_super(root);
+                if (ret)
+                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+        }
+        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+                ret = btrfs_error_commit_super(root);
                if (ret)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
@@ -2592,6 +2671,352 @@ out:
        return 0;
 }
+/*
+ * Print a mount-time warning when fs_state carries BTRFS_SUPER_FLAG_ERROR.
+ * Nothing is printed when read_only is non-zero.
+ */
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                              int read_only)
+{
+        int has_error = !!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR);
+        if (!read_only && has_error)
+                printk(KERN_WARNING "warning: mount fs with errors, "
+                       "running btrfsck is recommended\n");
+}
+/*
+ * Superblock write-out for a filesystem flagged with an error: run the
+ * delayed iputs under cleaner_mutex, pass through cleanup_work_sem once,
+ * tear down the transactions and finally write the super block.
+ * Returns whatever write_ctree_super() returns.
+ */
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        mutex_lock(&info->cleaner_mutex);
+        btrfs_run_delayed_iputs(root);
+        mutex_unlock(&info->cleaner_mutex);
+        /* taken and dropped at once; presumably waits out current holders */
+        down_write(&info->cleanup_work_sem);
+        up_write(&info->cleanup_work_sem);
+        /* cleanup FS via transaction */
+        btrfs_cleanup_transaction(root);
+        return write_ctree_super(NULL, root, 0);
+}
+/*
+ * Empty fs_info->ordered_operations: every inode is unlinked from the list
+ * and btrfs_invalidate_inodes() is called on that inode's root.
+ * Always returns 0.
+ */
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+        struct btrfs_inode *btrfs_inode;
+        struct list_head splice;
+        INIT_LIST_HEAD(&splice);
+        mutex_lock(&root->fs_info->ordered_operations_mutex);
+        spin_lock(&root->fs_info->ordered_extent_lock);
+        /* move all entries to a private list, leaving the shared one empty */
+        list_splice_init(&root->fs_info->ordered_operations, &splice);
+        while (!list_empty(&splice)) {
+                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                         ordered_operations);
+                list_del_init(&btrfs_inode->ordered_operations);
+                /*
+                 * NOTE(review): called with ordered_extent_lock (a spinlock)
+                 * held -- confirm btrfs_invalidate_inodes() cannot sleep.
+                 */
+                btrfs_invalidate_inodes(btrfs_inode->root);
+        }
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        mutex_unlock(&root->fs_info->ordered_operations_mutex);
+        return 0;
+}
+/*
+ * Drop every ordered extent queued on fs_info->ordered_extents.
+ * Always returns 0.
+ */
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+        struct list_head splice;
+        struct btrfs_ordered_extent *ordered;
+        struct inode *inode;
+        INIT_LIST_HEAD(&splice);
+        spin_lock(&root->fs_info->ordered_extent_lock);
+        /* work on a private copy so the shared list is left empty */
+        list_splice_init(&root->fs_info->ordered_extents, &splice);
+        while (!list_empty(&splice)) {
+                ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+                                     root_extent_list);
+                list_del_init(&ordered->root_extent_list);
+                atomic_inc(&ordered->refs);
+                /* the inode may be getting freed (in sys_unlink path). */
+                inode = igrab(ordered->inode);
+                /* the spinlock is released around the iput/put calls below */
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+                if (inode)
+                        iput(inode);
+                /*
+                 * The count is forced to 1, presumably so that the put
+                 * below is the final one regardless of other holders.
+                 */
+                atomic_set(&ordered->refs, 1);
+                btrfs_put_ordered_extent(ordered);
+                spin_lock(&root->fs_info->ordered_extent_lock);
+        }
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        return 0;
+}
+/*
+ * Throw away every delayed ref queued on @trans.
+ *
+ * Each node is erased from the rb-tree, its refcount is forced to 1 and it
+ * is then put; for head nodes the extent_op is freed and the head counters
+ * are adjusted.  @root is not used.  Always returns 0.
+ */
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                      struct btrfs_root *root)
+{
+        struct rb_node *node;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_node *ref;
+        int ret = 0;
+        delayed_refs = &trans->delayed_refs;
+        spin_lock(&delayed_refs->lock);
+        if (delayed_refs->num_entries == 0) {
+                /* do not return with delayed_refs->lock still held */
+                spin_unlock(&delayed_refs->lock);
+                printk(KERN_INFO "delayed_refs has NO entry\n");
+                return ret;
+        }
+        node = rb_first(&delayed_refs->root);
+        while (node) {
+                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+                /* fetch the successor before the node is erased */
+                node = rb_next(node);
+                ref->in_tree = 0;
+                rb_erase(&ref->rb_node, &delayed_refs->root);
+                delayed_refs->num_entries--;
+                atomic_set(&ref->refs, 1);
+                if (btrfs_delayed_ref_is_head(ref)) {
+                        struct btrfs_delayed_ref_head *head;
+                        head = btrfs_delayed_node_to_head(ref);
+                        /*
+                         * NOTE(review): head->mutex is taken while the
+                         * delayed_refs spinlock is held; mutex_lock() may
+                         * sleep -- confirm this is acceptable here.
+                         */
+                        mutex_lock(&head->mutex);
+                        kfree(head->extent_op);
+                        delayed_refs->num_heads--;
+                        if (list_empty(&head->cluster))
+                                delayed_refs->num_heads_ready--;
+                        list_del_init(&head->cluster);
+                        mutex_unlock(&head->mutex);
+                }
+                /* the lock is dropped around the put and the resched point */
+                spin_unlock(&delayed_refs->lock);
+                btrfs_put_delayed_ref(ref);
+                cond_resched();
+                spin_lock(&delayed_refs->lock);
+        }
+        spin_unlock(&delayed_refs->lock);
+        return ret;
+}
+/*
+ * Free every btrfs_pending_snapshot queued on @t->pending_snapshots,
+ * leaving that list empty.  Always returns 0.
+ */
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+        struct btrfs_pending_snapshot *pending;
+        struct btrfs_pending_snapshot *next;
+        LIST_HEAD(doomed);
+        list_splice_init(&t->pending_snapshots, &doomed);
+        /* _safe iteration: each entry is unlinked and freed inside the loop */
+        list_for_each_entry_safe(pending, next, &doomed, list) {
+                list_del_init(&pending->list);
+                kfree(pending);
+        }
+        return 0;
+}
+/*
+ * Empty fs_info->delalloc_inodes: every inode is unlinked from the list and
+ * btrfs_invalidate_inodes() is called on that inode's root.
+ * Always returns 0.
+ */
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+        struct btrfs_inode *btrfs_inode;
+        struct list_head splice;
+        INIT_LIST_HEAD(&splice);
+        /*
+         * Take delalloc_lock before touching the shared list: splicing it
+         * unlocked would race with anyone adding or removing inodes.
+         */
+        spin_lock(&root->fs_info->delalloc_lock);
+        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+        while (!list_empty(&splice)) {
+                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                    delalloc_inodes);
+                list_del_init(&btrfs_inode->delalloc_inodes);
+                /*
+                 * NOTE(review): called with delalloc_lock (a spinlock)
+                 * held -- confirm btrfs_invalidate_inodes() cannot sleep.
+                 */
+                btrfs_invalidate_inodes(btrfs_inode->root);
+        }
+        spin_unlock(&root->fs_info->delalloc_lock);
+        return 0;
+}
+/*
+ * Walk every range of @dirty_pages that has the @mark bits set, clear those
+ * bits and scrub the matching btree_inode pages: the extent buffer dirty bit
+ * is cleared, writeback is ended, the page dirty state and its radix-tree
+ * dirty tag are cleared, and the page is invalidated.
+ *
+ * Returns the last value stored in ret; the loop only ends when
+ * find_first_extent_bit() returns non-zero, so this is non-zero on exit.
+ */
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages,
+                                        int mark)
+{
+        int ret;
+        struct page *page;
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        struct extent_buffer *eb;
+        u64 start = 0;
+        u64 end;
+        u64 offset;
+        unsigned long index;
+        while (1) {
+                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+                                            mark);
+                if (ret)
+                        break;
+                clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+                while (start <= end) {
+                        index = start >> PAGE_CACHE_SHIFT;
+                        /* advance first so that "continue" cannot loop forever */
+                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+                        page = find_get_page(btree_inode->i_mapping, index);
+                        if (!page)
+                                continue;
+                        offset = page_offset(page);
+                        /*
+                         * NOTE(review): the lock taken belongs to dirty_pages
+                         * while the lookup is done in the inode's io_tree --
+                         * confirm this is the intended lock.
+                         */
+                        spin_lock(&dirty_pages->buffer_lock);
+                        eb = radix_tree_lookup(
+                             &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+                                               offset >> PAGE_CACHE_SHIFT);
+                        spin_unlock(&dirty_pages->buffer_lock);
+                        if (eb) {
+                                ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+                                                         &eb->bflags);
+                                atomic_set(&eb->refs, 1);
+                        }
+                        if (PageWriteback(page))
+                                end_page_writeback(page);
+                        lock_page(page);
+                        if (PageDirty(page)) {
+                                clear_page_dirty_for_io(page);
+                                spin_lock_irq(&page->mapping->tree_lock);
+                                radix_tree_tag_clear(&page->mapping->page_tree,
+                                                        page_index(page),
+                                                        PAGECACHE_TAG_DIRTY);
+                                spin_unlock_irq(&page->mapping->tree_lock);
+                        }
+                        page->mapping->a_ops->invalidatepage(page, 0);
+                        unlock_page(page);
+                        /* drop the reference taken by find_get_page() */
+                        page_cache_release(page);
+                }
+        }
+        return ret;
+}
+/*
+ * Release every EXTENT_DIRTY range recorded in @pinned_extents: the range
+ * is discarded, its dirty bit cleared and the extent unpinned.  The result
+ * of the discard is ignored.  Always returns 0.
+ */
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                       struct extent_io_tree *pinned_extents)
+{
+        u64 first;
+        u64 last;
+        /* always search from 0: handled ranges have had their bit cleared */
+        while (!find_first_extent_bit(pinned_extents, 0, &first, &last,
+                                      EXTENT_DIRTY)) {
+                /* opt_discard */
+                btrfs_error_discard_extent(root, first, last + 1 - first);
+                clear_extent_dirty(pinned_extents, first, last, GFP_NOFS);
+                btrfs_error_unpin_extent_range(root, first, last);
+                cond_resched();
+        }
+        return 0;
+}
+/*
+ * Tear down every transaction on fs_info->trans_list without committing:
+ * ordered operations/extents, delayed refs, pending snapshots, delalloc
+ * inodes, dirty and pinned extents are destroyed, waiters are woken, and
+ * the btrfs_transaction itself is freed.  Always returns 0.
+ */
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+        struct btrfs_transaction *t;
+        LIST_HEAD(list);
+        /* unconditional warning every time this path is entered */
+        WARN_ON(1);
+        mutex_lock(&root->fs_info->trans_mutex);
+        mutex_lock(&root->fs_info->transaction_kthread_mutex);
+        /* work on a private list; the shared trans_list is left empty */
+        list_splice_init(&root->fs_info->trans_list, &list);
+        while (!list_empty(&list)) {
+                t = list_entry(list.next, struct btrfs_transaction, list);
+                /*
+                 * NOTE(review): list_entry() on a non-empty list is not
+                 * expected to yield NULL; this check looks unreachable.
+                 */
+                if (!t)
+                        break;
+                btrfs_destroy_ordered_operations(root);
+                btrfs_destroy_ordered_extents(root);
+                btrfs_destroy_delayed_refs(t, root);
+                btrfs_block_rsv_release(root,
+                                        &root->fs_info->trans_block_rsv,
+                                        t->dirty_pages.dirty_bytes);
+                /* FIXME: cleanup wait for commit */
+                t->in_commit = 1;
+                t->blocked = 1;
+                if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+                        wake_up(&root->fs_info->transaction_blocked_wait);
+                t->blocked = 0;
+                if (waitqueue_active(&root->fs_info->transaction_wait))
+                        wake_up(&root->fs_info->transaction_wait);
+                /*
+                 * trans_mutex is dropped and immediately retaken, presumably
+                 * to give the woken waiters a chance to run -- TODO confirm.
+                 */
+                mutex_unlock(&root->fs_info->trans_mutex);
+                mutex_lock(&root->fs_info->trans_mutex);
+                t->commit_done = 1;
+                if (waitqueue_active(&t->commit_wait))
+                        wake_up(&t->commit_wait);
+                mutex_unlock(&root->fs_info->trans_mutex);
+                mutex_lock(&root->fs_info->trans_mutex);
+                btrfs_destroy_pending_snapshots(t);
+                btrfs_destroy_delalloc_inodes(root);
+                spin_lock(&root->fs_info->new_trans_lock);
+                root->fs_info->running_transaction = NULL;
+                spin_unlock(&root->fs_info->new_trans_lock);
+                btrfs_destroy_marked_extents(root, &t->dirty_pages,
+                                             EXTENT_DIRTY);
+                btrfs_destroy_pinned_extent(root,
+                                            root->fs_info->pinned_extents);
+                t->use_count = 0;
+                list_del_init(&t->list);
+                /* the structure is zeroed before going back to the slab cache */
+                memset(t, 0, sizeof(*t));
+                kmem_cache_free(btrfs_transaction_cachep, t);
+        }
+        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+        mutex_unlock(&root->fs_info->trans_mutex);
+        return 0;
+}
 static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf2..07b20dc2fd9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f..9786963b07e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
        struct btrfs_root *root;
-        struct dentry *dentry;
        struct inode *inode;
        struct btrfs_key key;
        int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
                return ERR_PTR(-ESTALE);
        }
-        dentry = d_obtain_alias(inode);
+        return d_obtain_alias(inode);
-        if (!IS_ERR(dentry))
-                dentry->d_op = &btrfs_dentry_operations;
-        return dentry;
 fail:
        srcu_read_unlock(&fs_info->subvol_srcu, index);
        return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
        struct inode *dir = child->d_inode;
-        static struct dentry *dentry;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -223,18 +218,91 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
-        dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+        return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
-        if (!IS_ERR(dentry))
-                dentry->d_op = &btrfs_dentry_operations;
-        return dentry;
 fail:
        btrfs_free_path(path);
        return ERR_PTR(ret);
 }
+/*
+ * export_operations->get_name: look up the name of @child inside @parent
+ * and copy it, NUL-terminated, into @name.
+ *
+ * For a subvolume root (i_ino == BTRFS_FIRST_FREE_OBJECTID) the name comes
+ * from the root backref item in the tree root; otherwise it comes from the
+ * inode ref item keyed by (child ino, parent ino).
+ *
+ * Returns 0 on success, -EINVAL when either inode is missing or @parent is
+ * not a directory, -ENOMEM when no path can be allocated, -ENOENT when no
+ * inode ref is found, or the negative error from btrfs_search_slot().
+ *
+ * NOTE(review): @name must have room for name_len + 1 bytes; the buffer
+ * size is not visible here -- confirm against the exportfs caller.
+ */
+static int btrfs_get_name(struct dentry *parent, char *name,
+                          struct dentry *child)
+{
+        struct inode *inode = child->d_inode;
+        struct inode *dir = parent->d_inode;
+        struct btrfs_path *path;
+        struct btrfs_root *root;
+        struct btrfs_inode_ref *iref;
+        struct btrfs_root_ref *rref;
+        struct extent_buffer *leaf;
+        unsigned long name_ptr;
+        struct btrfs_key key;
+        int name_len;
+        int ret;
+        if (!dir || !inode)
+                return -EINVAL;
+        if (!S_ISDIR(dir->i_mode))
+                return -EINVAL;
+        /* only look into the btrfs inode once dir is known to be non-NULL */
+        root = BTRFS_I(dir)->root;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                /* subvolume root: search the tree root for its backref */
+                key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+                key.type = BTRFS_ROOT_BACKREF_KEY;
+                key.offset = (u64)-1;
+                root = root->fs_info->tree_root;
+        } else {
+                key.objectid = inode->i_ino;
+                key.offset = dir->i_ino;
+                key.type = BTRFS_INODE_REF_KEY;
+        }
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0) {
+                btrfs_free_path(path);
+                return ret;
+        } else if (ret > 0) {
+                if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                        /*
+                         * The search used offset (u64)-1, so step back one
+                         * slot to the preceding item.
+                         * NOTE(review): slots[0] is not checked for 0 and
+                         * the found key is not verified -- confirm.
+                         */
+                        path->slots[0]--;
+                } else {
+                        btrfs_free_path(path);
+                        return -ENOENT;
+                }
+        }
+        leaf = path->nodes[0];
+        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                rref = btrfs_item_ptr(leaf, path->slots[0],
+                                      struct btrfs_root_ref);
+                /* the name bytes follow the fixed-size ref structure */
+                name_ptr = (unsigned long)(rref + 1);
+                name_len = btrfs_root_ref_name_len(leaf, rref);
+        } else {
+                iref = btrfs_item_ptr(leaf, path->slots[0],
+                                      struct btrfs_inode_ref);
+                name_ptr = (unsigned long)(iref + 1);
+                name_len = btrfs_inode_ref_name_len(leaf, iref);
+        }
+        read_extent_buffer(leaf, name, name_ptr, name_len);
+        btrfs_free_path(path);
+        /*
+         * have to add the null termination to make sure that reconnect_path
+         * gets the right len for strlen
+         */
+        name[name_len] = '\0';
+        return 0;
+}
 const struct export_operations btrfs_export_ops = {
        .encode_fh      = btrfs_encode_fh,
        .fh_to_dentry   = btrfs_fh_to_dentry,
        .fh_to_parent   = btrfs_fh_to_parent,
        .get_parent     = btrfs_get_parent,
+        .get_name       = btrfs_get_name,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec4..b55269340ce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
 static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
                             int load_cache_only)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        /*
         * We can't do the read from on-disk cache during a commit since we need
-         * to have the normal tree locking.
+         * to have the normal tree locking.  Also if we are currently trying to
+         * allocate blocks for the tree root we can't do the fast caching since
+         * we likely hold important locks.
         */
-        if (!trans->transaction->in_commit) {
+        if (!trans->transaction->in_commit &&
+            (root && root != root->fs_info->tree_root)) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct btrfs_root *root = block_group->fs_info->tree_root;
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
+        int dcs = BTRFS_DC_ERROR;
        int num_pages = 0;
        int retries = 0;
        int ret = 0;
@@ -2795,6 +2800,8 @@ again:
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED) {
+                /* We're not cached, don't bother trying to write stuff out */
+                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
@@ -2821,6 +2828,8 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+        if (!ret)
+                dcs = BTRFS_DC_SETUP;
        btrfs_free_reserved_data_space(inode, num_pages);
 out_put:
        iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
        btrfs_release_path(root, path);
 out:
        spin_lock(&block_group->lock);
-        if (ret)
+        block_group->disk_cache_state = dcs;
-                block_group->disk_cache_state = BTRFS_DC_ERROR;
-        else
-                block_group->disk_cache_state = BTRFS_DC_SETUP;
        spin_unlock(&block_group->lock);
        return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-        u64 num_devices = root->fs_info->fs_devices->rw_devices;
+        /*
+         * we add in the count of missing devices because we want
+         * to make sure that any RAID levels on a degraded FS
+         * continue to be honored.
+         */
+        u64 num_devices = root->fs_info->fs_devices->rw_devices +
+                root->fs_info->fs_devices->missing_devices;
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3077,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
        return btrfs_reduce_alloc_profile(root, flags);
 }
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
        u64 flags;
@@ -3149,8 +3161,12 @@ alloc:
                                             bytes + 2 * 1024 * 1024,
                                             alloc_target, 0);
                        btrfs_end_transaction(trans, root);
-                        if (ret < 0)
+                        if (ret < 0) {
-                                return ret;
+                                if (ret != -ENOSPC)
+                                        return ret;
+                                else
+                                        goto commit_trans;
+                        }
                        if (!data_sinfo) {
                                btrfs_set_inode_space_info(root, inode);
@@ -3161,6 +3177,7 @@ alloc:
                spin_unlock(&data_sinfo->lock);
                /* commit the current transaction and try again */
+commit_trans:
                if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
@@ -3412,7 +3429,7 @@ again:
         * our reservation.
         */
        if (unused <= space_info->total_bytes) {
-                unused -= space_info->total_bytes;
+                unused = space_info->total_bytes - unused;
                if (unused >= num_bytes) {
                        if (!reserved)
                                space_info->bytes_reserved += orig_bytes;
@@ -3709,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                return 0;
        }
-        WARN_ON(1);
-        printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-                block_rsv->size, block_rsv->reserved,
-                block_rsv->freed[0], block_rsv->freed[1]);
        return -ENOSPC;
 }
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                        cache_block_group(cache, trans, 1);
+                        cache_block_group(cache, trans, NULL, 1);
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
+                /*
+                 * this can happen if we end up cycling through all the
+                 * raid types, but we want to make sure we only allocate
+                 * for the proper type.
+                 */
+                if (!block_group_bits(block_group, data)) {
+                    u64 extra = BTRFS_BLOCK_GROUP_DUP |
+                                BTRFS_BLOCK_GROUP_RAID1 |
+                                BTRFS_BLOCK_GROUP_RAID10;
+                        /*
+                         * if they asked for extra copies and this block group
+                         * doesn't provide them, bail.  This does allow us to
+                         * fill raid0 from raid1.
+                         */
+                        if ((data & extra) && !(block_group->flags & extra))
+                                goto loop;
+                }
 have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
-                        ret = cache_block_group(block_group, trans, 1);
+                        ret = cache_block_group(block_group, trans,
+                                                orig_root, 1);
                        if (block_group->cached == BTRFS_CACHE_FINISHED)
                                goto have_block_group;
@@ -4958,7 +4990,8 @@ have_block_group:
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                                ret = cache_block_group(block_group, trans, 0);
+                                ret = cache_block_group(block_group, trans,
+                                                        orig_root, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        u64 num_bytes = ins->offset;
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-        cache_block_group(block_group, trans, 0);
+        cache_block_group(block_group, trans, NULL, 0);
        caching_ctl = get_caching_control(block_group);
        if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                           NULL, NULL);
                BUG_ON(ret < 0);
                if (ret > 0) {
-                        ret = btrfs_del_orphan_item(trans, tree_root,
+                        /* if we fail to delete the orphan item this time
-                                                    root->root_key.objectid);
+                         * around, it'll get picked up the next time.
-                        BUG_ON(ret);
+                         *
+                         * The most common failure here is just -ENOENT.
+                         */
+                        btrfs_del_orphan_item(trans, tree_root,
+                                              root->root_key.objectid);
                }
        }
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
-        num_devices = root->fs_info->fs_devices->rw_devices;
+        /*
+         * we add in the count of missing devices because we want
+         * to make sure that any RAID levels on a degraded FS
+         * continue to be honored.
+         */
+        num_devices = root->fs_info->fs_devices->rw_devices +
+                root->fs_info->fs_devices->missing_devices;
        if (num_devices == 1) {
                stripped |= BTRFS_BLOCK_GROUP_DUP;
                stripped = flags & ~stripped;
@@ -7926,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
            sinfo->bytes_may_use + sinfo->bytes_readonly +
-            cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+            cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                sinfo->bytes_reserved += cache->reserved_pinned;
                cache->reserved_pinned = 0;
                cache->ro = 1;
                ret = 0;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
        return ret;
@@ -7968,6 +8013,62 @@ out:
        return ret;
 }
+/*
+ * Sum the unused bytes (key.offset minus used bytes) of every read-only
+ * block group on groups_list.  Groups flagged RAID1, RAID10 or DUP are
+ * counted twice to account for their mirror copy.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+        const u64 mirrored = BTRFS_BLOCK_GROUP_RAID1 |
+                             BTRFS_BLOCK_GROUP_RAID10 |
+                             BTRFS_BLOCK_GROUP_DUP;
+        struct btrfs_block_group_cache *bg;
+        u64 total = 0;
+        list_for_each_entry(bg, groups_list, list) {
+                spin_lock(&bg->lock);
+                /* writable groups contribute nothing */
+                if (bg->ro) {
+                        u64 unused = bg->key.offset -
+                                     btrfs_block_group_used(&bg->item);
+                        total += unused * ((bg->flags & mirrored) ? 2 : 1);
+                }
+                spin_unlock(&bg->lock);
+        }
+        return total;
+}
+/*
+ * Total unused space held by the read-only block groups of @sinfo, summed
+ * over each of its BTRFS_NR_RAID_TYPES lists while sinfo->lock is held.
+ * Mirrors are taken into account by the per-list helper.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+        u64 total = 0;
+        int idx;
+        spin_lock(&sinfo->lock);
+        for (idx = 0; idx < BTRFS_NR_RAID_TYPES; idx++) {
+                struct list_head *groups = &sinfo->block_groups[idx];
+                if (list_empty(groups))
+                        continue;
+                total += __btrfs_get_ro_block_group_free_space(groups);
+        }
+        spin_unlock(&sinfo->lock);
+        return total;
+}
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
@@ -8048,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 min_free = btrfs_block_group_used(&block_group->item);
-                u64 dev_offset, max_avail;
+                u64 dev_offset;
                /*
                 * check to make sure we can actually find a chunk with enough
@@ -8056,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
                        ret = find_free_dev_extent(NULL, device, min_free,
-                                                   &dev_offset, &max_avail);
+                                                   &dev_offset, NULL);
                        if (!ret)
                                break;
                        ret = -1;
@@ -8247,7 +8348,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        break;
                if (ret != 0)
                        goto error;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -8541,3 +8641,14 @@ out:
        btrfs_free_path(path);
        return ret;
 }
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{       /*
+         * Pass-through to unpin_extent_range(): forwards root, start and end
+         * unchanged and returns that call's result as-is.
+         * NOTE(review): presumably exists so error-handling code outside this
+         * file can reach unpin_extent_range() - confirm against the callers.
+         */
+        return unpin_extent_range(root, start, end);
+}
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                               u64 num_bytes)
+{       /*
+         * Pass-through to btrfs_discard_extent(): forwards root, bytenr and
+         * num_bytes unchanged and returns that call's result as-is.
+         * NOTE(review): presumably exists so error-handling code outside this
+         * file can reach btrfs_discard_extent() - confirm against the callers.
+         */
+        return btrfs_discard_extent(root, bytenr, num_bytes);
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a..2e993cf1766 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
        bio_put(bio);
 }
-static struct bio *
+struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-                 gfp_t gfp_flags)
+                gfp_t gfp_flags)
 {
        struct bio *bio;
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        else
                nr = bio_get_nr_vecs(bdev);
-        bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
-                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        this_bio_flag = EXTENT_BIO_COMPRESSED;
+                        extent_set_compress_type(&this_bio_flag,
+                                                 em->compress_type);
+                }
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
@@ -2901,21 +2904,53 @@ out:
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
-        int ret;
+        int ret = 0;
        u64 off = start;
        u64 max = start + len;
        u32 flags = 0;
+        u32 found_type;
+        u64 last;
        u64 disko = 0;
+        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
+        struct btrfs_path *path;
+        struct btrfs_file_extent_item *item;
        int end = 0;
        u64 em_start = 0, em_len = 0;
        unsigned long emflags;
-        ret = 0;
+        int hole = 0;
        if (len == 0)
                return -EINVAL;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
+                                       path, inode->i_ino, -1, 0);
+        if (ret < 0) {
+                btrfs_free_path(path);
+                return ret;
+        }
+        WARN_ON(!ret);
+        path->slots[0]--;
+        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                              struct btrfs_file_extent_item);
+        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+        found_type = btrfs_key_type(&found_key);
+        /* No extents, just return */
+        if (found_key.objectid != inode->i_ino ||
+            found_type != BTRFS_EXTENT_DATA_KEY) {
+                btrfs_free_path(path);
+                return 0;
+        }
+        last = found_key.offset;
+        btrfs_free_path(path);
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
        em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2960,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                ret = PTR_ERR(em);
                goto out;
        }
        while (!end) {
+                hole = 0;
                off = em->start + em->len;
                if (off >= max)
                        end = 1;
+                if (em->block_start == EXTENT_MAP_HOLE) {
+                        hole = 1;
+                        goto next;
+                }
                em_start = em->start;
                em_len = em->len;
@@ -2939,8 +2981,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
-                } else if (em->block_start == EXTENT_MAP_HOLE) {
-                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                } else if (em->block_start == EXTENT_MAP_INLINE) {
                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
                                  FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2993,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
+next:
                emflags = em->flags;
                free_extent_map(em);
                em = NULL;
                if (!end) {
                        em = get_extent(inode, NULL, 0, off, max - off, 0);
                        if (!em)
@@ -2967,15 +3007,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        }
                        emflags = em->flags;
                }
                if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
-                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                if (em_start == last) {
-                                        em_len, flags);
+                        flags |= FIEMAP_EXTENT_LAST;
-                if (ret)
+                        end = 1;
-                        goto out_free;
+                }
+                if (!hole) {
+                        ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                                                em_len, flags);
+                        if (ret)
+                                goto out_free;
+                }
        }
 out_free:
        free_extent_map(em);
@@ -3027,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+        if (eb == NULL)
+                return NULL;
        eb->start = start;
        eb->len = len;
        spin_lock_init(&eb->lock);
@@ -3836,8 +3886,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
        spin_lock(&tree->buffer_lock);
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        if (!eb)
+        if (!eb) {
-                goto out;
+                spin_unlock(&tree->buffer_lock);
+                return ret;
+        }
        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef..7083cfafd06 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
        wait_queue_head_t lock_wq;
 };
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+                                            int compress_type)
+{       /*
+         * Record a compression type in a bio flags word: compress_type is
+         * shifted left by EXTENT_BIO_FLAG_SHIFT and OR-ed into *bio_flags.
+         * Bits already set in *bio_flags are left untouched (nothing is
+         * cleared first).
+         */
+        *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+static inline int extent_compress_type(unsigned long bio_flags)
+{       /*
+         * Read back the value stored above the low EXTENT_BIO_FLAG_SHIFT bits
+         * of bio_flags: the word is shifted right by EXTENT_BIO_FLAG_SHIFT and
+         * the result is returned as an int.
+         */
+        return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
 struct extent_map_tree;
 static inline struct extent_state *extent_state_next(struct extent_state *state)
@@ -310,4 +325,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                struct extent_io_tree *tree,
                                u64 start, u64 end, struct page *locked_page,
                                unsigned long op);
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+                gfp_t gfp_flags);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff6..b0e1fce1253 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include "ctree.h"
 #include "extent_map.h"
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
                return em;
        em->in_tree = 0;
        em->flags = 0;
+        em->compress_type = BTRFS_COMPRESS_NONE;
        atomic_set(&em->refs, 1);
        return em;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e64..28b44dbd1e3 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
        unsigned long flags;
        struct block_device *bdev;
        atomic_t refs;
-        int in_tree;
+        unsigned int in_tree:1;
+        unsigned int compress_type:4;
 };
 struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df08..c800d58f301 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
@@ -48,30 +49,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         struct page **prepared_pages,
                                         struct iov_iter *i)
 {
-        size_t copied;
+        size_t copied = 0;
        int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);
+        int total_copied = 0;
        while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
                struct page *page = prepared_pages[pg];
-again:
+                /*
-                if (unlikely(iov_iter_fault_in_readable(i, count)))
+                 * Copy data from userspace to the current page
-                        return -EFAULT;
+                 *
+                 * Disable pagefault to avoid recursive lock since
-                /* Copy data from userspace to the current page */
+                 * the pages are already locked
-                copied = iov_iter_copy_from_user(page, i, offset, count);
+                 */
+                pagefault_disable();
+                copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+                pagefault_enable();
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                iov_iter_advance(i, copied);
                write_bytes -= copied;
+                total_copied += copied;
+                /* Return to btrfs_file_aio_write to fault page */
                if (unlikely(copied == 0)) {
-                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+                        break;
-                                      iov_iter_single_seg_count(i));
-                        goto again;
                }
                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +86,7 @@ again:
                        offset = 0;
                }
        }
-        return 0;
+        return total_copied;
 }
 /*
@@ -220,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->bdev = em->bdev;
                        split->flags = flags;
+                        split->compress_type = em->compress_type;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
@@ -234,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;
+                        split->compress_type = em->compress_type;
                        if (compressed) {
                                split->block_len = em->block_len;
@@ -854,6 +861,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        unsigned long last_index;
        int will_write;
        int buffered = 0;
+        int copied = 0;
+        int dirty_pages = 0;
        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));
@@ -884,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        if (err)
                goto out;
+        /*
+         * If BTRFS flips readonly due to some impossible error
+         * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+         * although we have opened a file as writable, we have
+         * to stop this write operation to ensure FS consistency.
+         */
+        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+                err = -EROFS;
+                goto out;
+        }
        file_update_time(file);
        BTRFS_I(inode)->sequence++;
@@ -970,7 +990,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);
-                ret = btrfs_delalloc_reserve_space(inode, write_bytes);
+                /*
+                 * Fault pages before locking them in prepare_pages
+                 * to avoid recursive lock
+                 */
+                if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                ret = btrfs_delalloc_reserve_space(inode,
+                                        num_pages << PAGE_CACHE_SHIFT);
                if (ret)
                        goto out;
@@ -978,37 +1008,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    pos, first_index, last_index,
                                    write_bytes);
                if (ret) {
-                        btrfs_delalloc_release_space(inode, write_bytes);
+                        btrfs_delalloc_release_space(inode,
+                                        num_pages << PAGE_CACHE_SHIFT);
                        goto out;
                }
-                ret = btrfs_copy_from_user(pos, num_pages,
+                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
-                if (ret == 0) {
+                dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
+                                        PAGE_CACHE_SHIFT;
+                if (num_pages > dirty_pages) {
+                        if (copied > 0)
+                                atomic_inc(
+                                        &BTRFS_I(inode)->outstanding_extents);
+                        btrfs_delalloc_release_space(inode,
+                                        (num_pages - dirty_pages) <<
+                                        PAGE_CACHE_SHIFT);
+                }
+                if (copied > 0) {
                        dirty_and_release_pages(NULL, root, file, pages,
-                                                num_pages, pos, write_bytes);
+                                                dirty_pages, pos, copied);
                }
                btrfs_drop_pages(pages, num_pages);
-                if (ret) {
-                        btrfs_delalloc_release_space(inode, write_bytes);
-                        goto out;
-                }
-                if (will_write) {
+                if (copied > 0) {
-                        filemap_fdatawrite_range(inode->i_mapping, pos,
+                        if (will_write) {
-                                                 pos + write_bytes - 1);
+                                filemap_fdatawrite_range(inode->i_mapping, pos,
-                } else {
+                                                         pos + copied - 1);
-                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                        } else {
-                                                           num_pages);
+                                balance_dirty_pages_ratelimited_nr(
-                        if (num_pages <
+                                                        inode->i_mapping,
-                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                                                        dirty_pages);
-                                btrfs_btree_balance_dirty(root, 1);
+                                if (dirty_pages <
-                        btrfs_throttle(root);
+                                (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                                        btrfs_btree_balance_dirty(root, 1);
+                                btrfs_throttle(root);
+                        }
                }
-                pos += write_bytes;
+                pos += copied;
-                num_written += write_bytes;
+                num_written += copied;
                cond_resched();
        }
@@ -1047,8 +1089,14 @@ out:
                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                        trans = btrfs_start_transaction(root, 0);
+                        if (IS_ERR(trans)) {
+                                num_written = PTR_ERR(trans);
+                                goto done;
+                        }
+                        mutex_lock(&inode->i_mutex);
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
+                        mutex_unlock(&inode->i_mutex);
                        if (ret == 0) {
                                ret = btrfs_sync_log(trans, root);
                                if (ret == 0)
@@ -1067,6 +1115,7 @@ out:
                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
        }
+done:
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
 }
@@ -1202,6 +1251,117 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
        return 0;
 }
+static long btrfs_fallocate(struct file *file, int mode,
+                            loff_t offset, loff_t len)
+{       /*
+         * Preallocate file extents for the range [offset, offset + len),
+         * rounded out using the root's sectorsize.
+         *
+         * mode:   only FALLOC_FL_KEEP_SIZE may be set; any other bit gets
+         *         -EOPNOTSUPP.
+         * Returns 0 once the whole rounded range has been walked, otherwise
+         * the error from inode_newsize_ok(), btrfs_cont_expand(),
+         * btrfs_check_data_free_space() or btrfs_prealloc_file_range().
+         *
+         * inode->i_mutex is held from the size check until return, and the
+         * extent range lock is held while the extent maps are walked.
+         */
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct extent_state *cached_state = NULL;
+        u64 cur_offset;
+        u64 last_byte;
+        u64 alloc_start;
+        u64 alloc_end;
+        u64 alloc_hint = 0;
+        u64 locked_end;
+        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;        /* low-bit mask from sectorsize */
+        struct extent_map *em;
+        int ret;
+        alloc_start = offset & ~mask;   /* round the start down */
+        alloc_end =  (offset + len + mask) & ~mask;     /* round the end up */
+        /* We only support the FALLOC_FL_KEEP_SIZE mode */
+        if (mode & ~FALLOC_FL_KEEP_SIZE)
+                return -EOPNOTSUPP;
+        /*
+         * wait for ordered IO before we have any locks.  We'll loop again
+         * below with the locks held.
+         */
+        btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+        mutex_lock(&inode->i_mutex);
+        ret = inode_newsize_ok(inode, alloc_end);
+        if (ret)
+                goto out;
+        if (alloc_start > inode->i_size) {      /* range begins past the current i_size */
+                ret = btrfs_cont_expand(inode, alloc_start);
+                if (ret)
+                        goto out;
+        }
+        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+        if (ret)
+                goto out;
+        locked_end = alloc_end - 1;     /* last byte passed to the extent lock/unlock calls */
+        while (1) {
+                struct btrfs_ordered_extent *ordered;
+                /* the extent lock is ordered inside the running
+                 * transaction
+                 */
+                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+                                 locked_end, 0, &cached_state, GFP_NOFS);
+                ordered = btrfs_lookup_first_ordered_extent(inode,
+                                                            alloc_end - 1);
+                if (ordered &&
+                    ordered->file_offset + ordered->len > alloc_start &&
+                    ordered->file_offset < alloc_end) { /* ordered extent overlaps our range */
+                        btrfs_put_ordered_extent(ordered);
+                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                             alloc_start, locked_end,
+                                             &cached_state, GFP_NOFS);
+                        /*
+                         * we can't wait on the range with the transaction
+                         * running or with the extent lock held
+                         */
+                        btrfs_wait_ordered_range(inode, alloc_start,
+                                                 alloc_end - alloc_start);
+                } else {
+                        if (ordered)
+                                btrfs_put_ordered_extent(ordered);
+                        break;  /* leave the loop with the extent range still locked */
+                }
+        }
+        cur_offset = alloc_start;
+        while (1) {
+                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+                                      alloc_end - cur_offset, 0);
+                BUG_ON(IS_ERR(em) || !em);      /*
+                 * NOTE(review): a failed or NULL lookup trips BUG_ON here
+                 * instead of being returned to the caller as an error.
+                 */
+                last_byte = min(extent_map_end(em), alloc_end); /* end of this map, capped at alloc_end */
+                last_byte = (last_byte + mask) & ~mask; /* rounded up with mask */
+                if (em->block_start == EXTENT_MAP_HOLE ||
+                    (cur_offset >= inode->i_size &&
+                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {    /* hole, or at/after i_size and not PREALLOC */
+                        ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+                                                        last_byte - cur_offset,
+                                                        1 << inode->i_blkbits,
+                                                        offset + len,
+                                                        &alloc_hint);
+                        if (ret < 0) {
+                                free_extent_map(em);
+                                break;  /* ret carries the error to the common exit below */
+                        }
+                }
+                free_extent_map(em);
+                cur_offset = last_byte; /* continue from where this map (or alloc_end) stopped */
+                if (cur_offset >= alloc_end) {
+                        ret = 0;
+                        break;
+                }
+        }
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+                             &cached_state, GFP_NOFS);
+        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); /* runs on success and on error alike */
+out:
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
 const struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -1213,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
        .open           = generic_file_open,
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
+        .fallocate      = btrfs_fallocate,
        .unlocked_ioctl = btrfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b..60d68426695 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                       (unsigned long long)BTRFS_I(inode)->generation,
                       (unsigned long long)generation,
                       (unsigned long long)block_group->key.objectid);
-                goto out;
+                goto free_cache;
        }
        if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                return 0;
        }
+        node = rb_first(&block_group->free_space_offset);
+        if (!node) {
+                iput(inode);
+                return 0;
+        }
        last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
        filemap_write_and_wait(inode->i_mapping);
        btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         */
        first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
-        node = rb_first(&block_group->free_space_offset);
-        if (!node)
-                goto out_free;
        /*
         * Lock all pages first so we can lock the extent safely.
         *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa5..160b55b3e13 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        size_t cur_size = size;
        size_t datasize;
        unsigned long offset;
-        int use_compress = 0;
+        int compress_type = BTRFS_COMPRESS_NONE;
        if (compressed_size && compressed_pages) {
-                use_compress = 1;
+                compress_type = root->fs_info->compress_type;
                cur_size = compressed_size;
        }
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);
-        if (use_compress) {
+        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
-                                                  BTRFS_COMPRESS_ZLIB);
+                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
+        int compress_type;
        struct list_head list;
 };
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
-                                     unsigned long nr_pages)
+                                     unsigned long nr_pages,
+                                     int compress_type)
 {
        struct async_extent *async_extent;
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
+        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
 }
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
+        int compress_type = root->fs_info->compress_type;
        actual_end = min_t(u64, isize, end + 1);
 again:
@@ -381,12 +385,16 @@ again:
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
-                ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+                if (BTRFS_I(inode)->force_compress)
-                                                total_compressed, pages,
+                        compress_type = BTRFS_I(inode)->force_compress;
-                                                nr_pages, &nr_pages_ret,
-                                                &total_in,
+                ret = btrfs_compress_pages(compress_type,
-                                                &total_compressed,
+                                           inode->i_mapping, start,
-                                                max_compressed);
+                                           total_compressed, pages,
+                                           nr_pages, &nr_pages_ret,
+                                           &total_in,
+                                           &total_compressed,
+                                           max_compressed);
                if (!ret) {
                        unsigned long offset = total_compressed &
@@ -493,9 +501,10 @@ again:
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
-                                 total_compressed, pages, nr_pages_ret);
+                                 total_compressed, pages, nr_pages_ret,
+                                 compress_type);
-                if (start + num_bytes < end && start + num_bytes < actual_end) {
+                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
-                add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+                add_async_extent(async_cow, start, end - start + 1,
+                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }
@@ -640,6 +650,7 @@ retry:
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
+                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -656,11 +667,13 @@ retry:
                                                async_extent->ram_size - 1, 0);
                }
-                ret = btrfs_add_ordered_extent(inode, async_extent->start,
+                ret = btrfs_add_ordered_extent_compress(inode,
-                                               ins.objectid,
+                                                async_extent->start,
-                                               async_extent->ram_size,
+                                                ins.objectid,
-                                               ins.offset,
+                                                async_extent->ram_size,
-                                               BTRFS_ORDERED_COMPRESSED);
+                                                ins.offset,
+                                                BTRFS_ORDERED_COMPRESSED,
+                                                async_extent->compress_type);
                BUG_ON(ret);
                /*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
-        int compressed = 0;
+        int compress_type = 0;
        int ret;
        bool nolock = false;
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-                compressed = 1;
+                compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-                BUG_ON(compressed);
+                BUG_ON(compress_type);
                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->disk_len,
                                                ordered_extent->len,
                                                ordered_extent->len,
-                                                compressed, 0, 0,
+                                                compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                                   ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        logical = em->block_start;
                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                        extent_set_compress_type(&failrec->bio_flags,
+                                                 em->compress_type);
                }
                failrec->logical = logical;
                free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        int err;
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        err = inode_change_ok(inode, attr);
        if (err)
                return err;
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        int index;
        int ret;
-        dentry->d_op = &btrfs_dentry_operations;
        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
@@ -4127,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        return inode;
 }
-static int btrfs_dentry_delete(struct dentry *dentry)
+static int btrfs_dentry_delete(const struct dentry *dentry)
 {
        struct btrfs_root *root;
@@ -4501,6 +4518,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
+        inode->i_generation = BTRFS_I(inode)->generation;
        btrfs_set_inode_space_info(root, inode);
        if (mode & S_IFDIR)
@@ -4622,12 +4640,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 }
 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
-                            struct dentry *dentry, struct inode *inode,
+                            struct inode *dir, struct dentry *dentry,
-                            int backref, u64 index)
+                            struct inode *inode, int backref, u64 index)
 {
-        int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+        int err = btrfs_add_link(trans, dir, inode,
-                                 inode, dentry->d_name.name,
+                                 dentry->d_name.name, dentry->d_name.len,
-                                 dentry->d_name.len, backref, index);
+                                 backref, index);
        if (!err) {
                d_instantiate(dentry, inode);
                return 0;
@@ -4668,8 +4686,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
@@ -4682,7 +4699,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -4730,10 +4747,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino,
+                                BTRFS_I(dir)->block_group, mode, &index);
-                                objectid, BTRFS_I(dir)->block_group, mode,
-                                &index);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_unlock;
@@ -4745,7 +4760,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -4787,6 +4802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                return -EPERM;
        btrfs_inc_nlink(inode);
+        inode->i_ctime = CURRENT_TIME;
        err = btrfs_set_inode_index(dir, &index);
        if (err)
@@ -4805,15 +4821,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        btrfs_set_trans_block_group(trans, dir);
        ihold(inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
        if (err) {
                drop_inode = 1;
        } else {
+                struct dentry *parent = dget_parent(dentry);
                btrfs_update_inode_block_group(trans, dir);
                err = btrfs_update_inode(trans, root, inode);
                BUG_ON(err);
-                btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+                btrfs_log_new_name(trans, inode, NULL, parent);
+                dput(parent);
        }
        nr = trans->blocks_used;
@@ -4853,8 +4871,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
                                &index);
        if (IS_ERR(inode)) {
@@ -4877,9 +4894,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (err)
                goto out_fail;
-        err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
-                                 inode, dentry->d_name.name,
+                             dentry->d_name.len, 0, index);
-                                 dentry->d_name.len, 0, index);
        if (err)
                goto out_fail;
@@ -4931,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        size_t max_size;
        unsigned long inline_size;
        unsigned long ptr;
+        int compress_type;
        WARN_ON(pg_offset != 0);
+        compress_type = btrfs_file_extent_compression(leaf, item);
        max_size = btrfs_file_extent_ram_bytes(leaf, item);
        inline_size = btrfs_file_extent_inline_item_len(leaf,
                                        btrfs_item_nr(leaf, path->slots[0]));
@@ -4942,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        read_extent_buffer(leaf, tmp, ptr, inline_size);
        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
-        ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+        ret = btrfs_decompress(compress_type, tmp, page,
-                                    inline_size, max_size);
+                               extent_offset, inline_size, max_size);
        if (ret) {
                char *kaddr = kmap_atomic(page, KM_USER0);
                unsigned long copy_size = min_t(u64,
@@ -4985,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
-        int compressed;
+        int compress_type;
 again:
        read_lock(&em_tree->lock);
@@ -5044,7 +5062,7 @@ again:
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
-        compressed = btrfs_file_extent_compression(leaf, item);
+        compress_type = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
@@ -5090,8 +5108,9 @@ again:
                        em->block_start = EXTENT_MAP_HOLE;
                        goto insert;
                }
-                if (compressed) {
+                if (compress_type != BTRFS_COMPRESS_NONE) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                        em->compress_type = compress_type;
                        em->block_start = bytenr;
                        em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
                                                                         item);
@@ -5125,12 +5144,14 @@ again:
                em->len = (copy_size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
                em->orig_start = EXTENT_MAP_INLINE;
-                if (compressed)
+                if (compress_type) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                        em->compress_type = compress_type;
+                }
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                if (create == 0 && !PageUptodate(page)) {
-                        if (btrfs_file_extent_compression(leaf, item) ==
+                        if (btrfs_file_extent_compression(leaf, item) !=
-                            BTRFS_COMPRESS_ZLIB) {
+                            BTRFS_COMPRESS_NONE) {
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
@@ -5535,13 +5556,21 @@ struct btrfs_dio_private {
        u64 bytes;
        u32 *csums;
        void *private;
+        /* number of bios pending for this dio */
+        atomic_t pending_bios;
+        /* IO errors */
+        int errors;
+        struct bio *orig_bio;
 };
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
+        struct btrfs_dio_private *dip = bio->bi_private;
        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct bio_vec *bvec = bio->bi_io_vec;
-        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start;
@@ -5595,15 +5624,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
        struct btrfs_trans_handle *trans;
        struct btrfs_ordered_extent *ordered = NULL;
        struct extent_state *cached_state = NULL;
+        u64 ordered_offset = dip->logical_offset;
+        u64 ordered_bytes = dip->bytes;
        int ret;
        if (err)
                goto out_done;
+again:
-        ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
-                                             dip->logical_offset, dip->bytes);
+                                                   &ordered_offset,
+                                                   ordered_bytes);
        if (!ret)
-                goto out_done;
+                goto out_test;
        BUG_ON(!ordered);
@@ -5663,8 +5695,20 @@ out_unlock:
 out:
        btrfs_delalloc_release_metadata(inode, ordered->len);
        btrfs_end_transaction(trans, root);
+        ordered_offset = ordered->file_offset + ordered->len;
        btrfs_put_ordered_extent(ordered);
        btrfs_put_ordered_extent(ordered);
+out_test:
+        /*
+         * our bio might span multiple ordered extents.  If we haven't
+         * completed the accounting for the whole dio, go back and try again
+         */
+        if (ordered_offset < dip->logical_offset + dip->bytes) {
+                ordered_bytes = dip->logical_offset + dip->bytes -
+                        ordered_offset;
+                goto again;
+        }
 out_done:
        bio->bi_private = dip->private;
@@ -5684,6 +5728,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
        return 0;
 }
+/*
+ * Completion handler for one of the bios that btrfs_submit_direct_hook()
+ * carved out of a direct IO request (it is installed there as bi_end_io,
+ * with the shared btrfs_dio_private in bi_private).
+ *
+ * @bio: the split bio that just finished
+ * @err: non-zero if the IO failed
+ *
+ * A failure is logged and latched in dip->errors.  Whichever caller drops
+ * dip->pending_bios to zero completes dip->orig_bio: with bio_io_error()
+ * if any error was latched, otherwise marked BIO_UPTODATE and ended with
+ * status 0.  The split bio's reference is dropped in every case.
+ */
+static void btrfs_end_dio_bio(struct bio *bio, int err)
+{
+        struct btrfs_dio_private *dip = bio->bi_private;
+        if (err) {
+                printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
+                      "sector %#Lx len %u err no %d\n",
+                      dip->inode->i_ino, bio->bi_rw,
+                      (unsigned long long)bio->bi_sector, bio->bi_size, err);
+                dip->errors = 1;
+                /*
+                 * before atomic variable goto zero, we must make sure
+                 * dip->errors is perceived to be set.
+                 */
+                smp_mb__before_atomic_dec();
+        }
+        /* if there are more bios still pending for this dio, just exit */
+        if (!atomic_dec_and_test(&dip->pending_bios))
+                goto out;
+        /* last pending bio: finish the original bio on behalf of all parts */
+        if (dip->errors)
+                bio_io_error(dip->orig_bio);
+        else {
+                set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+                bio_endio(dip->orig_bio, 0);
+        }
+out:
+        bio_put(bio);
+}
+/*
+ * Allocate a bio for a direct IO split.
+ *
+ * @bdev:         block device the bio targets
+ * @first_sector: starting sector handed to btrfs_bio_alloc()
+ * @gfp_flags:    allocation flags handed to btrfs_bio_alloc()
+ *
+ * The vector count requested is whatever bio_get_nr_vecs() reports for
+ * @bdev.  Returns the result of btrfs_bio_alloc(); the callers in this
+ * file treat NULL as allocation failure.
+ */
+static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
+                                       u64 first_sector, gfp_t gfp_flags)
+{
+        int nr_vecs = bio_get_nr_vecs(bdev);
+        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+}
+/*
+ * Submit one direct IO bio.
+ *
+ * @bio:         bio to submit
+ * @inode:       inode the IO belongs to
+ * @rw:          request flags; REQ_WRITE selects the write path
+ * @file_offset: file offset covered by the start of @bio
+ * @skip_sum:    non-zero to bypass both checksum paths below
+ * @csums:       buffer passed to btrfs_lookup_bio_sums_dio() for reads
+ *
+ * Writes that need checksums are handed to btrfs_wq_submit_bio() with the
+ * direct IO start/done hooks and return that call's result.  Reads that
+ * need checksums first look them up into @csums.  Everything that is not
+ * handed to btrfs_wq_submit_bio() is then mapped with btrfs_map_bio().
+ *
+ * Returns 0 or the error from the failing step.
+ */
+static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+                                         int rw, u64 file_offset, int skip_sum,
+                                         u32 *csums)
+{
+        int write = rw & REQ_WRITE;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+        /* hold our own reference for the duration of the submission */
+        bio_get(bio);
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        if (ret)
+                goto err;
+        if (write && !skip_sum) {
+                ret = btrfs_wq_submit_bio(root->fs_info,
+                                   inode, rw, bio, 0, 0,
+                                   file_offset,
+                                   __btrfs_submit_bio_start_direct_io,
+                                   __btrfs_submit_bio_done);
+                /* taken on success as well: "err" is the common exit */
+                goto err;
+        } else if (!skip_sum)
+                btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                          file_offset, csums);
+        ret = btrfs_map_bio(root, rw, bio, 0, 1);
+err:
+        /* drop the reference taken by bio_get() above */
+        bio_put(bio);
+        return ret;
+}
+/*
+ * Split dip->orig_bio into one or more bios and submit them.
+ *
+ * @rw:       request flags forwarded to __btrfs_submit_dio_bio()
+ * @dip:      per-request state; supplies the inode, the original bio, the
+ *            starting file offset and the checksum buffer
+ * @skip_sum: non-zero to bypass checksum handling
+ *
+ * The original bio's vectors are walked in order and added to the current
+ * bio.  When the next vector would push the accumulated length past
+ * map_length (as filled in by btrfs_map_block()), or bio_add_page() does
+ * not take the whole vector, the current bio is submitted and a new one is
+ * started at the advanced sector / file offset.  Every bio created here
+ * completes through btrfs_end_dio_bio(), and dip->pending_bios counts how
+ * many are outstanding.
+ *
+ * Returns -ENOMEM or -EIO only if the very first allocation or mapping
+ * fails.  Any later failure is latched in dip->errors, the original bio is
+ * failed once pending_bios reaches zero, and 0 is returned.
+ */
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+                                    int skip_sum)
+{
+        struct inode *inode = dip->inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+        struct bio *bio;
+        struct bio *orig_bio = dip->orig_bio;
+        struct bio_vec *bvec = orig_bio->bi_io_vec;
+        u64 start_sector = orig_bio->bi_sector;
+        u64 file_offset = dip->logical_offset;
+        u64 submit_len = 0;
+        u64 map_length;
+        int nr_pages = 0;
+        u32 *csums = dip->csums;
+        int ret = 0;
+        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+        if (!bio)
+                return -ENOMEM;
+        bio->bi_private = dip;
+        bio->bi_end_io = btrfs_end_dio_bio;
+        /* account for the first bio */
+        atomic_inc(&dip->pending_bios);
+        map_length = orig_bio->bi_size;
+        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+                              &map_length, NULL, 0);
+        if (ret) {
+                /*
+                 * NOTE(review): pending_bios was incremented above and is
+                 * not decremented on this path -- confirm the caller's
+                 * cleanup does not depend on the counter.
+                 */
+                bio_put(bio);
+                return -EIO;
+        }
+        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+                /*
+                 * The current bio is full when this vector would exceed
+                 * map_length or bio_add_page() takes less than all of it.
+                 * bvec is not advanced in that case, so the same vector is
+                 * retried against the freshly allocated bio.
+                 */
+                if (unlikely(map_length < submit_len + bvec->bv_len ||
+                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+                                 bvec->bv_offset) < bvec->bv_len)) {
+                        /*
+                         * inc the count before we submit the bio so
+                         * we know the end IO handler won't happen before
+                         * we inc the count. Otherwise, the dip might get freed
+                         * before we're done setting it up
+                         */
+                        atomic_inc(&dip->pending_bios);
+                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
+                                                     file_offset, skip_sum,
+                                                     csums);
+                        if (ret) {
+                                /* undo the inc just above; out_err drops this bio's own count */
+                                bio_put(bio);
+                                atomic_dec(&dip->pending_bios);
+                                goto out_err;
+                        }
+                        /*
+                         * step past the entries used by the submitted bio;
+                         * presumably one checksum per added vector -- confirm
+                         */
+                        if (!skip_sum)
+                                csums = csums + nr_pages;
+                        start_sector += submit_len >> 9;
+                        file_offset += submit_len;
+                        submit_len = 0;
+                        nr_pages = 0;
+                        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+                                                  start_sector, GFP_NOFS);
+                        if (!bio)
+                                goto out_err;
+                        bio->bi_private = dip;
+                        bio->bi_end_io = btrfs_end_dio_bio;
+                        map_length = orig_bio->bi_size;
+                        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+                                              &map_length, NULL, 0);
+                        if (ret) {
+                                bio_put(bio);
+                                goto out_err;
+                        }
+                } else {
+                        submit_len += bvec->bv_len;
+                        nr_pages ++;
+                        bvec++;
+                }
+        }
+        /* submit whatever accumulated in the last bio */
+        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+                                     csums);
+        if (!ret)
+                return 0;
+        bio_put(bio);
+out_err:
+        dip->errors = 1;
+        /*
+         * before atomic variable goto zero, we must
+         * make sure dip->errors is perceived to be set.
+         */
+        smp_mb__before_atomic_dec();
+        /* drop the count held for the bio that was never (successfully) sent */
+        if (atomic_dec_and_test(&dip->pending_bios))
+                bio_io_error(dip->orig_bio);
+        /* bio_end_io() will handle error, so we needn't return it */
+        return 0;
+}
 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                                loff_t file_offset)
 {
@@ -5723,36 +5937,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        dip->disk_bytenr = (u64)bio->bi_sector << 9;
        bio->bi_private = dip;
+        dip->errors = 0;
+        dip->orig_bio = bio;
+        atomic_set(&dip->pending_bios, 0);
        if (write)
                bio->bi_end_io = btrfs_endio_direct_write;
        else
                bio->bi_end_io = btrfs_endio_direct_read;
-        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
-        if (ret)
+        if (!ret)
-                goto out_err;
-        if (write && !skip_sum) {
-                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                   inode, rw, bio, 0, 0,
-                                   dip->logical_offset,
-                                   __btrfs_submit_bio_start_direct_io,
-                                   __btrfs_submit_bio_done);
-                if (ret)
-                        goto out_err;
                return;
-        } else if (!skip_sum)
-                btrfs_lookup_bio_sums_dio(root, inode, bio,
-                                          dip->logical_offset, dip->csums);
-        ret = btrfs_map_bio(root, rw, bio, 0, 1);
-        if (ret)
-                goto out_err;
-        return;
-out_err:
-        kfree(dip->csums);
-        kfree(dip);
 free_ordered:
        /*
         * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5956,7 @@ free_ordered:
         */
        if (write) {
                struct btrfs_ordered_extent *ordered;
-                ordered = btrfs_lookup_ordered_extent(inode,
+                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-                                                      dip->logical_offset);
                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
                        btrfs_free_reserved_extent(root, ordered->start,
@@ -6306,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->ordered_data_close = 0;
        ei->orphan_meta_reserved = 0;
        ei->dummy_inode = 0;
-        ei->force_compress = 0;
+        ei->force_compress = BTRFS_COMPRESS_NONE;
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -6322,6 +6517,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        return inode;
 }
+/*
+ * RCU callback that releases a btrfs inode; btrfs_destroy_inode() queues it
+ * with call_rcu() on inode->i_rcu.
+ *
+ * @head: the i_rcu member embedded in the struct inode being freed
+ *
+ * Recovers the inode from @head, re-initialises its i_dentry list head and
+ * returns the containing btrfs inode to btrfs_inode_cachep.
+ */
+static void btrfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
@@ -6391,7 +6593,7 @@ void btrfs_destroy_inode(struct inode *inode)
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+        call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 int btrfs_drop_inode(struct inode *inode)
@@ -6607,8 +6809,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        BUG_ON(ret);
        if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
-                btrfs_log_new_name(trans, old_inode, old_dir,
+                struct dentry *parent = dget_parent(new_dentry);
-                                   new_dentry->d_parent);
+                btrfs_log_new_name(trans, old_inode, old_dir, parent);
+                dput(parent);
                btrfs_end_log_trans(root);
        }
 out_fail:
@@ -6758,8 +6961,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
                                &index);
        err = PTR_ERR(inode);
@@ -6773,7 +6975,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -6844,6 +7046,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
+        u64 i_size;
        int ret = 0;
        bool own_trans = true;
@@ -6885,11 +7088,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                    (actual_len > inode->i_size) &&
                    (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
-                                i_size_write(inode, actual_len);
+                                i_size = actual_len;
                        else
-                                i_size_write(inode, cur_offset);
+                                i_size = cur_offset;
-                        i_size_write(inode, cur_offset);
+                        i_size_write(inode, i_size);
-                        btrfs_ordered_update_i_size(inode, cur_offset, NULL);
+                        btrfs_ordered_update_i_size(inode, i_size, NULL);
                }
                ret = btrfs_update_inode(trans, root, inode);
@@ -6919,118 +7122,20 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
                                           min_size, actual_len, alloc_hint, trans);
 }
-static long btrfs_fallocate(struct inode *inode, int mode,
-                            loff_t offset, loff_t len)
-{
-        struct extent_state *cached_state = NULL;
-        u64 cur_offset;
-        u64 last_byte;
-        u64 alloc_start;
-        u64 alloc_end;
-        u64 alloc_hint = 0;
-        u64 locked_end;
-        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-        struct extent_map *em;
-        int ret;
-        alloc_start = offset & ~mask;
-        alloc_end =  (offset + len + mask) & ~mask;
-        /*
-         * wait for ordered IO before we have any locks.  We'll loop again
-         * below with the locks held.
-         */
-        btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-        mutex_lock(&inode->i_mutex);
-        if (alloc_start > inode->i_size) {
-                ret = btrfs_cont_expand(inode, alloc_start);
-                if (ret)
-                        goto out;
-        }
-        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
-        if (ret)
-                goto out;
-        locked_end = alloc_end - 1;
-        while (1) {
-                struct btrfs_ordered_extent *ordered;
-                /* the extent lock is ordered inside the running
-                 * transaction
-                 */
-                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
-                                 locked_end, 0, &cached_state, GFP_NOFS);
-                ordered = btrfs_lookup_first_ordered_extent(inode,
-                                                            alloc_end - 1);
-                if (ordered &&
-                    ordered->file_offset + ordered->len > alloc_start &&
-                    ordered->file_offset < alloc_end) {
-                        btrfs_put_ordered_extent(ordered);
-                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                             alloc_start, locked_end,
-                                             &cached_state, GFP_NOFS);
-                        /*
-                         * we can't wait on the range with the transaction
-                         * running or with the extent lock held
-                         */
-                        btrfs_wait_ordered_range(inode, alloc_start,
-                                                 alloc_end - alloc_start);
-                } else {
-                        if (ordered)
-                                btrfs_put_ordered_extent(ordered);
-                        break;
-                }
-        }
-        cur_offset = alloc_start;
-        while (1) {
-                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
-                                      alloc_end - cur_offset, 0);
-                BUG_ON(IS_ERR(em) || !em);
-                last_byte = min(extent_map_end(em), alloc_end);
-                last_byte = (last_byte + mask) & ~mask;
-                if (em->block_start == EXTENT_MAP_HOLE ||
-                    (cur_offset >= inode->i_size &&
-                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                        ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-                                                        last_byte - cur_offset,
-                                                        1 << inode->i_blkbits,
-                                                        offset + len,
-                                                        &alloc_hint);
-                        if (ret < 0) {
-                                free_extent_map(em);
-                                break;
-                        }
-                }
-                free_extent_map(em);
-                cur_offset = last_byte;
-                if (cur_offset >= alloc_end) {
-                        ret = 0;
-                        break;
-                }
-        }
-        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-                             &cached_state, GFP_NOFS);
-        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-out:
-        mutex_unlock(&inode->i_mutex);
-        return ret;
-}
 static int btrfs_set_page_dirty(struct page *page)
 {
        return __set_page_dirty_nobuffers(page);
 }
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+                return -EROFS;
        if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
                return -EACCES;
-        return generic_permission(inode, mask, btrfs_check_acl);
+        return generic_permission(inode, mask, flags, btrfs_check_acl);
 }
 static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7123,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
-        .fallocate      = btrfs_fallocate,
        .fiemap         = btrfs_fiemap,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
@@ -7139,6 +7243,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
+        .getattr        = btrfs_getattr,
        .permission     = btrfs_permission,
        .setxattr       = btrfs_setxattr,
        .getxattr       = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3..a506a22b522 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        unsigned int flags, oldflags;
        int ret;
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
@@ -233,7 +236,8 @@ static noinline int create_subvol(struct btrfs_root *root,
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        struct btrfs_root *new_root;
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct dentry *parent = dget_parent(dentry);
+        struct inode *dir;
        int ret;
        int err;
        u64 objectid;
@@ -242,8 +246,13 @@ static noinline int create_subvol(struct btrfs_root *root,
        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
                                       0, &objectid);
-        if (ret)
+        if (ret) {
+                dput(parent);
                return ret;
+        }
+        dir = parent->d_inode;
        /*
         * 1 - inode item
         * 2 - refs
@@ -251,8 +260,10 @@ static noinline int create_subvol(struct btrfs_root *root,
         * 2 - dir items
         */
        trans = btrfs_start_transaction(root, 6);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
+                dput(parent);
                return PTR_ERR(trans);
+        }
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
                                      0, objectid, NULL, 0, 0, 0);
@@ -339,6 +350,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
+        dput(parent);
        if (async_transid) {
                *async_transid = trans->transid;
                err = btrfs_commit_transaction_async(trans, root, 1);
@@ -351,9 +363,11 @@ fail:
 }
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-                           char *name, int namelen, u64 *async_transid)
+                           char *name, int namelen, u64 *async_transid,
+                           bool readonly)
 {
        struct inode *inode;
+        struct dentry *parent;
        struct btrfs_pending_snapshot *pending_snapshot;
        struct btrfs_trans_handle *trans;
        int ret;
@@ -368,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        btrfs_init_block_rsv(&pending_snapshot->block_rsv);
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
+        pending_snapshot->readonly = readonly;
        trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
        if (IS_ERR(trans)) {
@@ -396,7 +411,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        btrfs_orphan_cleanup(pending_snapshot->snap);
-        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+        parent = dget_parent(dentry);
+        inode = btrfs_lookup_dentry(parent->d_inode, dentry);
+        dput(parent);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto fail;
@@ -497,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
-                                   u64 *async_transid)
+                                   u64 *async_transid, bool readonly)
 {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
@@ -529,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
        if (snap_src) {
                error = create_snapshot(snap_src, dentry,
-                                        name, namelen, async_transid);
+                                        name, namelen, async_transid, readonly);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
                                      name, namelen, async_transid);
@@ -626,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct page *page;
+        struct btrfs_super_block *disk_super;
        unsigned long last_index;
        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
        unsigned long total_read = 0;
+        u64 features;
        u64 page_start;
        u64 page_end;
        u64 last_len = 0;
@@ -636,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
        u64 defrag_end = 0;
        unsigned long i;
        int ret;
+        int compress_type = BTRFS_COMPRESS_ZLIB;
+        if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+                if (range->compress_type > BTRFS_COMPRESS_TYPES)
+                        return -EINVAL;
+                if (range->compress_type)
+                        compress_type = range->compress_type;
+        }
        if (inode->i_size == 0)
                return 0;
@@ -671,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
                total_read++;
                mutex_lock(&inode->i_mutex);
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-                        BTRFS_I(inode)->force_compress = 1;
+                        BTRFS_I(inode)->force_compress = compress_type;
                ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
                if (ret)
@@ -769,10 +796,17 @@ loop_unlock:
                atomic_dec(&root->fs_info->async_submit_draining);
                mutex_lock(&inode->i_mutex);
-                BTRFS_I(inode)->force_compress = 0;
+                BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
                mutex_unlock(&inode->i_mutex);
        }
+        disk_super = &root->fs_info->super_copy;
+        features = btrfs_super_incompat_flags(disk_super);
+        if (range->compress_type == BTRFS_COMPRESS_LZO) {
+                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+                btrfs_set_super_incompat_flags(disk_super, features);
+        }
        return 0;
 err_reservations:
@@ -889,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                                                    char *name,
                                                    unsigned long fd,
                                                    int subvol,
-                                                    u64 *transid)
+                                                    u64 *transid,
+                                                    bool readonly)
 {
        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct file *src_file;
@@ -907,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
        if (subvol) {
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                     NULL, transid);
+                                     NULL, transid, readonly);
        } else {
                struct inode *src_inode;
                src_file = fget(fd);
@@ -926,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                }
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
                                     BTRFS_I(src_inode)->root,
-                                     transid);
+                                     transid, readonly);
                fput(src_file);
        }
 out:
@@ -934,49 +969,142 @@ out:
 }
 static noinline int btrfs_ioctl_snap_create(struct file *file,
-                                            void __user *arg, int subvol,
+                                            void __user *arg, int subvol)
-                                            int async)
 {
-        struct btrfs_ioctl_vol_args *vol_args = NULL;
+        struct btrfs_ioctl_vol_args *vol_args;
-        struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
-        char *name;
-        u64 fd;
-        u64 transid = 0;
        int ret;
-        if (async) {
+        vol_args = memdup_user(arg, sizeof(*vol_args));
-                async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+        if (IS_ERR(vol_args))
-                if (IS_ERR(async_vol_args))
+                return PTR_ERR(vol_args);
-                        return PTR_ERR(async_vol_args);
+        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-                name = async_vol_args->name;
+        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-                fd = async_vol_args->fd;
+                                              vol_args->fd, subvol,
-                async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+                                              NULL, false);
-        } else {
-                vol_args = memdup_user(arg, sizeof(*vol_args));
-                if (IS_ERR(vol_args))
-                        return PTR_ERR(vol_args);
-                name = vol_args->name;
-                fd = vol_args->fd;
-                vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-        }
-        ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+        kfree(vol_args);
-                                              subvol, &transid);
+        return ret;
+}
-        if (!ret && async) {
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
-                if (copy_to_user(arg +
+                                               void __user *arg, int subvol)
-                                offsetof(struct btrfs_ioctl_async_vol_args,
+{
-                                transid), &transid, sizeof(transid)))
+        struct btrfs_ioctl_vol_args_v2 *vol_args;
-                        return -EFAULT;
+        int ret;
+        u64 transid = 0;
+        u64 *ptr = NULL;
+        bool readonly = false;
+        vol_args = memdup_user(arg, sizeof(*vol_args));
+        if (IS_ERR(vol_args))
+                return PTR_ERR(vol_args);
+        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+        if (vol_args->flags &
+            ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+                ret = -EOPNOTSUPP;
+                goto out;
        }
+        if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+                ptr = &transid;
+        if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+                readonly = true;
+        ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+                                              vol_args->fd, subvol,
+                                              ptr, readonly);
+        if (ret == 0 && ptr &&
+            copy_to_user(arg +
+                         offsetof(struct btrfs_ioctl_vol_args_v2,
+                                  transid), ptr, sizeof(*ptr)))
+                ret = -EFAULT;
+out:
        kfree(vol_args);
-        kfree(async_vol_args);
+        return ret;
+}
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+                                                void __user *arg)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret = 0;
+        u64 flags = 0;
+        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+                return -EINVAL;
+        down_read(&root->fs_info->subvol_sem);
+        if (btrfs_root_readonly(root))
+                flags |= BTRFS_SUBVOL_RDONLY;
+        up_read(&root->fs_info->subvol_sem);
+        if (copy_to_user(arg, &flags, sizeof(flags)))
+                ret = -EFAULT;
        return ret;
 }
+/*
+ * Set or clear the read-only flag of the subvolume that @file belongs to.
+ *
+ * @arg points to a user-space u64 flag word.  BTRFS_SUBVOL_RDONLY is the
+ * only bit that can be changed here: BTRFS_SUBVOL_CREATE_ASYNC is rejected
+ * with -EINVAL and any other unknown bit with -EOPNOTSUPP.  @file must
+ * refer to the inode numbered BTRFS_FIRST_FREE_OBJECTID (-EINVAL otherwise)
+ * and the super block must not be flagged MS_RDONLY (-EROFS).
+ *
+ * The in-memory root item is changed first and restored to its previous
+ * flags if starting the transaction or updating the root fails.
+ */
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+                                              void __user *arg)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        u64 root_flags;
+        u64 flags;
+        int ret = 0;
+        if (root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+                return -EINVAL;
+        if (copy_from_user(&flags, arg, sizeof(flags)))
+                return -EFAULT;
+        /*
+         * The async bit is a creation-time option and has no meaning for an
+         * existing subvolume.  Test the bit itself: masking with its
+         * complement would reject BTRFS_SUBVOL_RDONLY as well.
+         */
+        if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
+                return -EINVAL;
+        if (flags & ~BTRFS_SUBVOL_RDONLY)
+                return -EOPNOTSUPP;
+        down_write(&root->fs_info->subvol_sem);
+        /* nothing to do */
+        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+                goto out;
+        /* remember the old flags so they can be restored on failure */
+        root_flags = btrfs_root_flags(&root->root_item);
+        if (flags & BTRFS_SUBVOL_RDONLY)
+                btrfs_set_root_flags(&root->root_item,
+                                     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+        else
+                btrfs_set_root_flags(&root->root_item,
+                                     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+        trans = btrfs_start_transaction(root, 1);
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                goto out_reset;
+        }
+        ret = btrfs_update_root(trans, root,
+                                &root->root_key, &root->root_item);
+        btrfs_commit_transaction(trans, root);
+out_reset:
+        if (ret)
+                btrfs_set_root_flags(&root->root_item, root_flags);
+out:
+        up_write(&root->fs_info->subvol_sem);
+        return ret;
+}
 /*
 * helper to check if the subvolume references other subvolumes
 */
@@ -1485,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
        struct btrfs_ioctl_defrag_range_args *range;
        int ret;
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                return ret;
@@ -1613,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
                return -EINVAL;
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                return ret;
@@ -1669,12 +1803,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                olen = len = src->i_size - off;
        /* if we extend to eof, continue to block boundary */
        if (off + len == src->i_size)
-                len = ((src->i_size + bs-1) & ~(bs-1))
+                len = ALIGN(src->i_size, bs) - off;
-                        - off;
        /* verify the end result is block aligned */
-        if ((off & (bs-1)) ||
+        if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
-            ((off + len) & (bs-1)))
+            !IS_ALIGNED(destoff, bs))
                goto out_unlock;
        /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +2007,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                         * but shouldn't round up the file size
                         */
                        endoff = new_key.offset + datal;
-                        if (endoff > off+olen)
+                        if (endoff > destoff+olen)
-                                endoff = off+olen;
+                                endoff = destoff+olen;
                        if (endoff > inode->i_size)
                                btrfs_i_size_write(inode, endoff);
@@ -1935,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
        if (file->private_data)
                goto out;
+        ret = -EROFS;
+        if (btrfs_root_readonly(root))
+                goto out;
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                goto out;
@@ -2234,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
        case FS_IOC_GETVERSION:
                return btrfs_ioctl_getversion(file, argp);
        case BTRFS_IOC_SNAP_CREATE:
-                return btrfs_ioctl_snap_create(file, argp, 0, 0);
+                return btrfs_ioctl_snap_create(file, argp, 0);
-        case BTRFS_IOC_SNAP_CREATE_ASYNC:
+        case BTRFS_IOC_SNAP_CREATE_V2:
-                return btrfs_ioctl_snap_create(file, argp, 0, 1);
+                return btrfs_ioctl_snap_create_v2(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
-                return btrfs_ioctl_snap_create(file, argp, 1, 0);
+                return btrfs_ioctl_snap_create(file, argp, 1);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp);
+        case BTRFS_IOC_SUBVOL_GETFLAGS:
+                return btrfs_ioctl_subvol_getflags(file, argp);
+        case BTRFS_IOC_SUBVOL_SETFLAGS:
+                return btrfs_ioctl_subvol_setflags(file, argp);
        case BTRFS_IOC_DEFAULT_SUBVOL:
                return btrfs_ioctl_default_subvol(file, argp);
        case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf96..8fb382167b1 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,16 @@ struct btrfs_ioctl_vol_args {
        char name[BTRFS_PATH_NAME_MAX + 1];
 };
-#define BTRFS_SNAPSHOT_NAME_MAX 4079
+#define BTRFS_SUBVOL_CREATE_ASYNC       (1ULL << 0)
-struct btrfs_ioctl_async_vol_args {
+#define BTRFS_SUBVOL_RDONLY             (1ULL << 1)
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
        __s64 fd;
        __u64 transid;
-        char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+        __u64 flags;
+        __u64 unused[4];
+        char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -129,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
         */
        __u32 extent_thresh;
+        /*
+         * which compression method to use if turning on compression
+         * for this defrag operation.  If unspecified, zlib will
+         * be used
+         */
+        __u32 compress_type;
        /* spare for later */
-        __u32 unused[5];
+        __u32 unused[4];
 };
 struct btrfs_ioctl_space_info {
@@ -187,6 +199,8 @@ struct btrfs_ioctl_space_args {
                                    struct btrfs_ioctl_space_args)
 #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
-                                   struct btrfs_ioctl_async_vol_args)
+                                   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 00000000000..cc9b450399d
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+#define LZO_LEN 4
+struct workspace {
+        void *mem;      /* scratch memory handed to lzo1x_1_compress() */
+        void *buf;      /* where decompressed data goes */
+        void *cbuf;     /* where compressed data goes */
+        struct list_head list;
+};
+/*
+ * Release a workspace: its three vmalloc'ed buffers first, then the
+ * descriptor that embeds @ws.
+ */
+static void lzo_free_workspace(struct list_head *ws)
+{
+        struct workspace *wsp = list_entry(ws, struct workspace, list);
+        vfree(wsp->mem);
+        vfree(wsp->cbuf);
+        vfree(wsp->buf);
+        kfree(wsp);
+}
+/*
+ * Allocate a workspace together with its compressor scratch memory and the
+ * two data buffers, each sized for the worst-case LZO output of one page.
+ *
+ * Returns the list head embedded in the new workspace, or ERR_PTR(-ENOMEM)
+ * if any allocation fails.
+ */
+static struct list_head *lzo_alloc_workspace(void)
+{
+        struct workspace *wsp = kzalloc(sizeof(*wsp), GFP_NOFS);
+        if (wsp == NULL)
+                return ERR_PTR(-ENOMEM);
+        wsp->mem = vmalloc(LZO1X_MEM_COMPRESS);
+        wsp->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+        wsp->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+        if (wsp->mem && wsp->buf && wsp->cbuf) {
+                INIT_LIST_HEAD(&wsp->list);
+                return &wsp->list;
+        }
+        /* partial allocation: hand everything back through the free helper */
+        lzo_free_workspace(&wsp->list);
+        return ERR_PTR(-ENOMEM);
+}
+/* Store @len at @buf as a little-endian 32-bit value (LZO_LEN bytes). */
+static inline void write_compress_length(char *buf, size_t len)
+{
+        __le32 le_len = cpu_to_le32(len);
+        memcpy(buf, &le_len, LZO_LEN);
+}
+/* Fetch the little-endian 32-bit length stored in the LZO_LEN bytes at @buf. */
+static inline size_t read_compress_length(char *buf)
+{
+        __le32 le_len;
+        memcpy(&le_len, buf, LZO_LEN);
+        return le32_to_cpu(le_len);
+}
+/*
+ * Compress up to @len bytes of page cache data from @mapping, starting at
+ * file offset @start, into newly allocated pages that are stored in @pages.
+ *
+ * Output layout: a LZO_LEN byte little-endian total length, followed by one
+ * segment per input page.  Each segment is a LZO_LEN byte length followed
+ * by the compressed bytes.  A segment header never straddles a page: when
+ * fewer than LZO_LEN bytes remain in the output page the remainder is zero
+ * filled and the next header starts on a fresh page.
+ *
+ * At most @nr_dest_pages output pages are used, and no further input is
+ * consumed once more than @max_out output bytes have been produced.
+ *
+ * *@out_pages is set to the number of pages placed in @pages on every
+ * return path, including failures; this function never frees them.
+ * *@total_in and *@total_out are only filled in on success.
+ *
+ * Returns 0 on success, -ENOMEM if an output page cannot be allocated and
+ * -1 if the compressor fails, the destination pages run out, or the
+ * compressed stream is not smaller than the input.
+ */
+static int lzo_compress_pages(struct list_head *ws,
+                              struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out)
+{
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
+        int ret = 0;
+        char *data_in;
+        char *cpage_out;
+        int nr_pages = 0;
+        struct page *in_page = NULL;
+        struct page *out_page = NULL;
+        unsigned long bytes_left;
+        size_t in_len;
+        size_t out_len;
+        char *buf;
+        unsigned long tot_in = 0;
+        unsigned long tot_out = 0;
+        unsigned long pg_bytes_left;
+        unsigned long out_offset;
+        unsigned long bytes;
+        *out_pages = 0;
+        *total_out = 0;
+        *total_in = 0;
+        in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+        data_in = kmap(in_page);
+        /*
+         * store the size of all chunks of compressed data in
+         * the first 4 bytes
+         */
+        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+        if (out_page == NULL) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        cpage_out = kmap(out_page);
+        /* leave room for the total length, it is filled in at the end */
+        out_offset = LZO_LEN;
+        tot_out = LZO_LEN;
+        pages[0] = out_page;
+        nr_pages = 1;
+        pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+        /* compress at most one page of data each time */
+        in_len = min(len, PAGE_CACHE_SIZE);
+        while (tot_in < len) {
+                ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+                                       &out_len, workspace->mem);
+                if (ret != LZO_E_OK) {
+                        printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
+                               ret);
+                        ret = -1;
+                        goto out;
+                }
+                /* store the size of this chunk of compressed data */
+                write_compress_length(cpage_out + out_offset, out_len);
+                tot_out += LZO_LEN;
+                out_offset += LZO_LEN;
+                pg_bytes_left -= LZO_LEN;
+                tot_in += in_len;
+                tot_out += out_len;
+                /* copy bytes from the working buffer into the pages */
+                buf = workspace->cbuf;
+                while (out_len) {
+                        bytes = min_t(unsigned long, pg_bytes_left, out_len);
+                        memcpy(cpage_out + out_offset, buf, bytes);
+                        out_len -= bytes;
+                        pg_bytes_left -= bytes;
+                        buf += bytes;
+                        out_offset += bytes;
+                        /*
+                         * we need another page for writing out.
+                         *
+                         * Note if there's less than 4 bytes left, we just
+                         * skip to a new page.
+                         */
+                        if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+                            pg_bytes_left == 0) {
+                                if (pg_bytes_left) {
+                                        memset(cpage_out + out_offset, 0,
+                                               pg_bytes_left);
+                                        tot_out += pg_bytes_left;
+                                }
+                                /* we're done, don't allocate new page */
+                                if (out_len == 0 && tot_in >= len)
+                                        break;
+                                kunmap(out_page);
+                                if (nr_pages == nr_dest_pages) {
+                                        /* already unmapped, skip it at out: */
+                                        out_page = NULL;
+                                        ret = -1;
+                                        goto out;
+                                }
+                                out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                                if (out_page == NULL) {
+                                        ret = -ENOMEM;
+                                        goto out;
+                                }
+                                cpage_out = kmap(out_page);
+                                pages[nr_pages++] = out_page;
+                                pg_bytes_left = PAGE_CACHE_SIZE;
+                                out_offset = 0;
+                        }
+                }
+                /*
+                 * we're making it bigger, give up.  Report it as a failure
+                 * like the other give-up paths instead of returning 0 with
+                 * zeroed totals.
+                 */
+                if (tot_in > 8192 && tot_in < tot_out) {
+                        ret = -1;
+                        goto out;
+                }
+                /* we're all done */
+                if (tot_in >= len)
+                        break;
+                if (tot_out > max_out)
+                        break;
+                bytes_left = len - tot_in;
+                kunmap(in_page);
+                page_cache_release(in_page);
+                start += PAGE_CACHE_SIZE;
+                in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+                data_in = kmap(in_page);
+                in_len = min(bytes_left, PAGE_CACHE_SIZE);
+        }
+        /* the compressed stream is larger than what was consumed */
+        if (tot_out > tot_in) {
+                ret = -1;
+                goto out;
+        }
+        /* store the size of all chunks of compressed data */
+        cpage_out = kmap(pages[0]);
+        write_compress_length(cpage_out, tot_out);
+        kunmap(pages[0]);
+        ret = 0;
+        *total_out = tot_out;
+        *total_in = tot_in;
+out:
+        *out_pages = nr_pages;
+        if (out_page)
+                kunmap(out_page);
+        if (in_page) {
+                kunmap(in_page);
+                page_cache_release(in_page);
+        }
+        return ret;
+}
+/*
+ * Decompress the LZO stream stored in @pages_in (@srclen bytes, in the
+ * layout produced by lzo_compress_pages()) one segment at a time and pass
+ * every decompressed segment to btrfs_decompress_buf2page() together with
+ * @disk_start, @bvec and @vcnt.
+ *
+ * A segment that lies completely inside the current input page is
+ * decompressed in place; a segment that crosses a page boundary is first
+ * gathered into workspace->cbuf.
+ *
+ * Returns 0 on success and -1 if the input pages run out, a segment length
+ * cannot fit the working buffer, or LZO decompression fails.
+ */
+static int lzo_decompress_biovec(struct list_head *ws,
+                                 struct page **pages_in,
+                                 u64 disk_start,
+                                 struct bio_vec *bvec,
+                                 int vcnt,
+                                 size_t srclen)
+{
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
+        int ret = 0, ret2;
+        char *data_in;
+        unsigned long page_in_index = 0;
+        unsigned long page_out_index = 0;
+        unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+                                        PAGE_CACHE_SIZE;
+        unsigned long buf_start;
+        unsigned long buf_offset = 0;
+        unsigned long bytes;
+        unsigned long working_bytes;
+        unsigned long pg_offset;
+        size_t in_len;
+        size_t out_len;
+        unsigned long in_offset;
+        unsigned long in_page_bytes_left;
+        unsigned long tot_in;
+        unsigned long tot_out;
+        unsigned long tot_len;
+        char *buf;
+        data_in = kmap(pages_in[0]);
+        /* the stream starts with its total length, never trust it past srclen */
+        tot_len = read_compress_length(data_in);
+        tot_in = LZO_LEN;
+        in_offset = LZO_LEN;
+        tot_len = min_t(size_t, srclen, tot_len);
+        in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+        tot_out = 0;
+        pg_offset = 0;
+        while (tot_in < tot_len) {
+                in_len = read_compress_length(data_in + in_offset);
+                /*
+                 * in_len comes from disk.  The compressor emits at most
+                 * lzo1x_worst_compress(PAGE_CACHE_SIZE) bytes per segment,
+                 * which is also the size of cbuf, so anything larger is
+                 * corruption and would overflow the working buffer below.
+                 */
+                if (in_len > lzo1x_worst_compress(PAGE_CACHE_SIZE)) {
+                        ret = -1;
+                        break;
+                }
+                in_page_bytes_left -= LZO_LEN;
+                in_offset += LZO_LEN;
+                tot_in += LZO_LEN;
+                tot_in += in_len;
+                working_bytes = in_len;
+                /* fast path: avoid using the working buffer */
+                if (in_page_bytes_left >= in_len) {
+                        buf = data_in + in_offset;
+                        bytes = in_len;
+                        goto cont;
+                }
+                /* copy bytes from the pages into the working buffer */
+                buf = workspace->cbuf;
+                buf_offset = 0;
+                while (working_bytes) {
+                        bytes = min(working_bytes, in_page_bytes_left);
+                        memcpy(buf + buf_offset, data_in + in_offset, bytes);
+                        buf_offset += bytes;
+cont:
+                        working_bytes -= bytes;
+                        in_page_bytes_left -= bytes;
+                        in_offset += bytes;
+                        /* check if we need to pick another page */
+                        if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+                            || in_page_bytes_left == 0) {
+                                /* account for the padding at the page tail */
+                                tot_in += in_page_bytes_left;
+                                if (working_bytes == 0 && tot_in >= tot_len)
+                                        break;
+                                kunmap(pages_in[page_in_index]);
+                                page_in_index++;
+                                if (page_in_index >= total_pages_in) {
+                                        ret = -1;
+                                        /* nothing is mapped any more */
+                                        data_in = NULL;
+                                        goto done;
+                                }
+                                data_in = kmap(pages_in[page_in_index]);
+                                in_page_bytes_left = PAGE_CACHE_SIZE;
+                                in_offset = 0;
+                        }
+                }
+                /* capacity of workspace->buf, updated to the actual size */
+                out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+                ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+                                            &out_len);
+                if (ret != LZO_E_OK) {
+                        printk(KERN_WARNING "btrfs decompress failed\n");
+                        ret = -1;
+                        break;
+                }
+                buf_start = tot_out;
+                tot_out += out_len;
+                ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+                                                 tot_out, disk_start,
+                                                 bvec, vcnt,
+                                                 &page_out_index, &pg_offset);
+                /* NOTE(review): 0 presumably means the bio is full - confirm */
+                if (ret2 == 0)
+                        break;
+        }
+done:
+        if (data_in)
+                kunmap(pages_in[page_in_index]);
+        return ret;
+}
+/*
+ * Decompress the first segment of the LZO stream at @data_in (@srclen
+ * bytes: total length header, segment length header, compressed bytes) and
+ * copy up to @destlen bytes of the result, starting at offset @start_byte
+ * of the decompressed data, to the beginning of @dest_page.
+ *
+ * Returns 0 on success and -1 if the headers or the segment do not fit in
+ * @srclen, decompression fails, or the decompressed data is shorter than
+ * @start_byte.
+ */
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen)
+{
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
+        size_t in_len;
+        size_t out_len;
+        int ret = 0;
+        char *kaddr;
+        unsigned long bytes;
+        BUG_ON(srclen < LZO_LEN);
+        /* two headers are read below, make sure both are really there */
+        if (srclen < 2 * LZO_LEN) {
+                ret = -1;
+                goto out;
+        }
+        /* skip the total length header, only one segment is decoded here */
+        data_in += LZO_LEN;
+        in_len = read_compress_length(data_in);
+        data_in += LZO_LEN;
+        /* in_len comes from disk: it must not reach past the source buffer */
+        if (in_len > srclen - 2 * LZO_LEN) {
+                ret = -1;
+                goto out;
+        }
+        out_len = PAGE_CACHE_SIZE;
+        ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+        if (ret != LZO_E_OK) {
+                printk(KERN_WARNING "btrfs decompress failed!\n");
+                ret = -1;
+                goto out;
+        }
+        if (out_len < start_byte) {
+                ret = -1;
+                goto out;
+        }
+        bytes = min_t(unsigned long, destlen, out_len - start_byte);
+        kaddr = kmap_atomic(dest_page, KM_USER0);
+        memcpy(kaddr, workspace->buf + start_byte, bytes);
+        kunmap_atomic(kaddr, KM_USER0);
+out:
+        return ret;
+}
+/* Table of the LZO entry points defined in this file. */
+struct btrfs_compress_op btrfs_lzo_compress = {
+        .compress_pages         = lzo_compress_pages,
+        .decompress_biovec      = lzo_decompress_biovec,
+        .decompress             = lzo_decompress,
+        .alloc_workspace        = lzo_alloc_workspace,
+        .free_workspace         = lzo_free_workspace,
+};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca..2b61e1ddcd9 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
-                                      int type, int dio)
+                                      int type, int dio, int compress_type)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->disk_len = disk_len;
        entry->bytes_left = len;
        entry->inode = inode;
+        entry->compress_type = compress_type;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type)
 {
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-                                          disk_len, type, 0);
+                                          disk_len, type, 0,
+                                          BTRFS_COMPRESS_NONE);
 }
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type)
 {
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-                                          disk_len, type, 1);
+                                          disk_len, type, 1,
+                                          BTRFS_COMPRESS_NONE);
+}
+/*
+ * Add an ordered extent and record which compression algorithm it is
+ * written with.  The dio argument handed down is always 0.
+ */
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+                                      u64 start, u64 len, u64 disk_len,
+                                      int type, int compress_type)
+{
+        const int dio = 0;
+        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+                                          disk_len, type, dio, compress_type);
 }
 /*
@@ -250,6 +262,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
 /*
 * this is used to account for finished IO across a given range
+ * of the file.  The IO may span ordered extents.  If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ *
+ * file_offset is updated to one byte past the range that is recorded as
+ * complete.  This allows you to walk forward in the file.
+ */
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+                                   struct btrfs_ordered_extent **cached,
+                                   u64 *file_offset, u64 io_size)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        struct btrfs_ordered_extent *entry = NULL;
+        struct rb_node *node;
+        u64 first;
+        u64 last;
+        u64 done_bytes;
+        /* non-zero means "do not report this extent as newly completed" */
+        int ret = 1;
+        spin_lock(&tree->lock);
+        node = tree_search(tree, *file_offset);
+        if (!node)
+                goto out;
+        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+        if (!offset_in_entry(entry, *file_offset))
+                goto out;
+        /* clamp the finished range to the part covered by this extent */
+        first = max(*file_offset, entry->file_offset);
+        last = min(*file_offset + io_size,
+                   entry->file_offset + entry->len);
+        /* let the caller continue from the end of what was accounted */
+        *file_offset = last;
+        if (first > last)
+                printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
+                       (unsigned long long)first,
+                       (unsigned long long)last);
+        done_bytes = last - first;
+        if (done_bytes > entry->bytes_left)
+                printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+                       (unsigned long long)entry->bytes_left,
+                       (unsigned long long)done_bytes);
+        entry->bytes_left -= done_bytes;
+        /* test_and_set_bit yields 0 only for the caller that sets IO_DONE */
+        if (entry->bytes_left == 0)
+                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE,
+                                       &entry->flags);
+out:
+        if (ret == 0 && cached && entry) {
+                /* hand the extent back with an extra reference held */
+                *cached = entry;
+                atomic_inc(&entry->refs);
+        }
+        spin_unlock(&tree->lock);
+        return ret == 0;
+}
+/*
+ * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3..ff1f69aa188 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
        /* flags (described above) */
        unsigned long flags;
+        /* compression algorithm */
+        int compress_type;
        /* reference count */
        atomic_t refs;
@@ -141,10 +144,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 file_offset, u64 io_size);
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+                                   struct btrfs_ordered_extent **cached,
+                                   u64 *file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+                                      u64 start, u64 len, u64 disk_len,
+                                      int type, int compress_type);
 int btrfs_add_ordered_sum(struct inode *inode,
                          struct btrfs_ordered_extent *entry,
                          struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28..f8be250963a 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-        if (ret)
+        if (ret < 0)
                goto out;
+        if (ret) {
+                ret = -ENOENT;
+                goto out;
+        }
        ret = btrfs_del_item(trans, root, path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8..b2130c46fdb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
 static const struct super_operations btrfs_super_ops;
+/*
+ * Translate a negative errno value into a short human readable string.
+ * Values without a dedicated message are formatted into nbuf (when one is
+ * supplied) as "error <n>"; NULL is returned if no string could be produced.
+ * fs_info is currently unused.
+ */
+static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+                                      char nbuf[16])
+{
+        if (errno == -EIO)
+                return "IO failure";
+        if (errno == -ENOMEM)
+                return "Out of memory";
+        if (errno == -EROFS)
+                return "Readonly filesystem";
+        /* anything else: fall back to the numeric value */
+        if (nbuf && snprintf(nbuf, 16, "error %d", -errno) >= 0)
+                return nbuf;
+        return NULL;
+}
+/*
+ * Mark the in-memory filesystem state as having hit an error.
+ */
+static void __save_error_info(struct btrfs_fs_info *fs_info)
+{
+        /*
+         * today we only save the error info into ram.  Long term we'll
+         * also send it down to the disk
+         */
+        /* fs_state is tested as a bitmask, so keep any other bits intact */
+        fs_info->fs_state |= BTRFS_SUPER_FLAG_ERROR;
+}
+/*
+ * Record that the filesystem has hit an error.
+ *
+ * NOTE: the write_super work is not done here; it was moved to umount
+ * time in order to avoid a deadlock, because umount holds all the locks.
+ */
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+        __save_error_info(fs_info);
+}
+/*
+ * btrfs reacts to a recorded error by forcing the filesystem read-only.
+ * Nothing is done when the super block is already read-only or when no
+ * error flag is set in fs_state.
+ */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+        struct super_block *sb = fs_info->sb;
+        bool writable = !(sb->s_flags & MS_RDONLY);
+        if (writable && (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+                sb->s_flags |= MS_RDONLY;
+                printk(KERN_INFO "btrfs is forced readonly\n");
+        }
+}
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the appropriate error response: the error is logged, recorded
+ * in fs_info and the filesystem is forced read-only.
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                     unsigned int line, int errno)
+{
+        char nbuf[16];
+        const char *errstr;
+        struct super_block *sb = fs_info->sb;
+        /*
+         * -EROFS on a super block that is already MS_RDONLY is the one
+         * case that needs no reaction at all.
+         */
+        if ((sb->s_flags & MS_RDONLY) && errno == -EROFS)
+                return;
+        errstr = btrfs_decode_error(fs_info, errno, nbuf);
+        printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+                sb->s_id, function, line, errstr);
+        /* remember the error first, then act on it */
+        save_error_info(fs_info);
+        btrfs_handle_error(fs_info);
+}
 static void btrfs_put_super(struct super_block *sb)
 {
        struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
        Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
-        Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
-        Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-        Opt_user_subvol_rm_allowed,
+        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
 };
 static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
        {Opt_alloc_start, "alloc_start=%s"},
        {Opt_thread_pool, "thread_pool=%d"},
        {Opt_compress, "compress"},
+        {Opt_compress_type, "compress=%s"},
        {Opt_compress_force, "compress-force"},
+        {Opt_compress_force_type, "compress-force=%s"},
        {Opt_ssd, "ssd"},
        {Opt_ssd_spread, "ssd_spread"},
        {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
        char *p, *num, *orig;
        int intarg;
        int ret = 0;
+        char *compress_type;
+        bool compress_force = false;
        if (!options)
                return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        btrfs_set_opt(info->mount_opt, NODATACOW);
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
-                case Opt_compress:
-                        printk(KERN_INFO "btrfs: use compression\n");
-                        btrfs_set_opt(info->mount_opt, COMPRESS);
-                        break;
                case Opt_compress_force:
-                        printk(KERN_INFO "btrfs: forcing compression\n");
+                case Opt_compress_force_type:
-                        btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+                        compress_force = true;
+                case Opt_compress:
+                case Opt_compress_type:
+                        if (token == Opt_compress ||
+                            token == Opt_compress_force ||
+                            strcmp(args[0].from, "zlib") == 0) {
+                                compress_type = "zlib";
+                                info->compress_type = BTRFS_COMPRESS_ZLIB;
+                        } else if (strcmp(args[0].from, "lzo") == 0) {
+                                compress_type = "lzo";
+                                info->compress_type = BTRFS_COMPRESS_LZO;
+                        } else {
+                                ret = -EINVAL;
+                                goto out;
+                        }
                        btrfs_set_opt(info->mount_opt, COMPRESS);
+                        if (compress_force) {
+                                btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+                                pr_info("btrfs: force %s compression\n",
+                                        compress_type);
+                        } else
+                                pr_info("btrfs: use %s compression\n",
+                                        compress_type);
                        break;
                case Opt_ssd:
                        printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -244,6 +350,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_space_cache:
                        printk(KERN_INFO "btrfs: enabling disk space caching\n");
                        btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+                        break;
                case Opt_clear_cache:
                        printk(KERN_INFO "btrfs: force clearing of disk cache\n");
                        btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -459,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_magic = BTRFS_SUPER_MAGIC;
        sb->s_op = &btrfs_super_ops;
+        sb->s_d_op = &btrfs_dentry_operations;
        sb->s_export_op = &btrfs_export_ops;
        sb->s_xattr = btrfs_xattr_handlers;
        sb->s_time_gran = 1;
@@ -562,12 +670,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-        struct btrfs_fs_devices *test_fs_devices = data;
+        struct btrfs_root *test_root = data;
        struct btrfs_root *root = btrfs_sb(s);
-        return root->fs_info->fs_devices == test_fs_devices;
+        /*
+         * If this super block is going away, return false as it
+         * can't match as an existing super block.
+         */
+        if (!atomic_read(&s->s_active))
+                return 0;
+        return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
 }
+/*
+ * "set" callback for sget(): stores data in s->s_fs_info and returns
+ * whatever set_anon_super() returns.
+ */
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+        s->s_fs_info = data;
+        return set_anon_super(s, data);
+}
 /*
 * Find a superblock for the given device / mount point.
 *
@@ -581,6 +703,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        struct super_block *s;
        struct dentry *root;
        struct btrfs_fs_devices *fs_devices = NULL;
+        struct btrfs_root *tree_root = NULL;
+        struct btrfs_fs_info *fs_info = NULL;
        fmode_t mode = FMODE_READ;
        char *subvol_name = NULL;
        u64 subvol_objectid = 0;
@@ -608,8 +732,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                goto error_close_devices;
        }
+        /*
+         * Setup a dummy root and fs_info for test/set super.  This is because
+         * we don't actually fill this stuff out until open_ctree, but we need
+         * it for searching for existing supers, so this lets us do that and
+         * then open_ctree will properly initialize everything later.
+         */
+        fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+        tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+        if (!fs_info || !tree_root) {
+                error = -ENOMEM;
+                goto error_close_devices;
+        }
+        fs_info->tree_root = tree_root;
+        fs_info->fs_devices = fs_devices;
+        tree_root->fs_info = fs_info;
        bdev = fs_devices->latest_bdev;
-        s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+        s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
        if (IS_ERR(s))
                goto error_s;
@@ -652,9 +792,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                mutex_unlock(&root->d_inode->i_mutex);
                if (IS_ERR(new_root)) {
+                        dput(root);
                        deactivate_locked_super(s);
                        error = PTR_ERR(new_root);
-                        dput(root);
                        goto error_free_subvol_name;
                }
                if (!new_root->d_inode) {
@@ -675,6 +815,8 @@ error_s:
        error = PTR_ERR(s);
 error_close_devices:
        btrfs_close_devices(fs_devices);
+        kfree(fs_info);
+        kfree(tree_root);
 error_free_subvol_name:
        kfree(subvol_name);
        return ERR_PTR(error);
@@ -717,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ *
+ * On success 0 is returned and *free_bytes holds the result; otherwise
+ * -ENOMEM or the error from btrfs_account_dev_extents_size() is returned
+ * and *free_bytes is left untouched.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_device_info *devices_info;
+        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+        struct btrfs_device *device;
+        u64 skip_space;
+        u64 type;
+        u64 avail_space;
+        u64 used_space;
+        u64 min_stripe_size;
+        int min_stripes = 1;
+        int i = 0, nr_devices;
+        int ret;
+        nr_devices = fs_info->fs_devices->rw_devices;
+        BUG_ON(!nr_devices);
+        devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+                               GFP_NOFS);
+        if (!devices_info)
+                return -ENOMEM;
+        /* calc min stripe number for data space allocation */
+        type = btrfs_get_alloc_profile(root, 1);
+        if (type & BTRFS_BLOCK_GROUP_RAID0)
+                min_stripes = 2;
+        else if (type & BTRFS_BLOCK_GROUP_RAID1)
+                min_stripes = 2;
+        else if (type & BTRFS_BLOCK_GROUP_RAID10)
+                min_stripes = 4;
+        if (type & BTRFS_BLOCK_GROUP_DUP)
+                min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+        else
+                min_stripe_size = BTRFS_STRIPE_LEN;
+        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+                if (!device->in_fs_metadata)
+                        continue;
+                avail_space = device->total_bytes - device->bytes_used;
+                /* align with stripe_len */
+                do_div(avail_space, BTRFS_STRIPE_LEN);
+                avail_space *= BTRFS_STRIPE_LEN;
+                /*
+                 * In order to avoid overwriting the superblock on the drive,
+                 * btrfs starts at an offset of at least 1MB when doing chunk
+                 * allocation.
+                 */
+                skip_space = 1024 * 1024;
+                /* user can set the offset in fs_info->alloc_start. */
+                if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+                    device->total_bytes)
+                        skip_space = max(fs_info->alloc_start, skip_space);
+                /*
+                 * btrfs can not use the free space in [0, skip_space - 1],
+                 * we must subtract it from the total. In order to implement
+                 * it, we account the used space in this range first.
+                 */
+                ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+                                                     &used_space);
+                if (ret) {
+                        kfree(devices_info);
+                        return ret;
+                }
+                /* calc the free space in [0, skip_space - 1] */
+                skip_space -= used_space;
+                /*
+                 * we can not use the free space in [0, skip_space - 1],
+                 * subtract it from the total.
+                 */
+                if (avail_space && avail_space >= skip_space)
+                        avail_space -= skip_space;
+                else
+                        avail_space = 0;
+                if (avail_space < min_stripe_size)
+                        continue;
+                /*
+                 * devices_info only has room for rw_devices entries; never
+                 * store past its end even if alloc_list is longer.
+                 */
+                if (i >= nr_devices)
+                        break;
+                devices_info[i].dev = device;
+                devices_info[i].max_avail = avail_space;
+                i++;
+        }
+        /* from here on nr_devices counts the usable entries collected */
+        nr_devices = i;
+        btrfs_descending_sort_devices(devices_info, nr_devices);
+        /* walk from the last array entry towards the first */
+        i = nr_devices - 1;
+        avail_space = 0;
+        while (nr_devices >= min_stripes) {
+                if (devices_info[i].max_avail >= min_stripe_size) {
+                        int j;
+                        u64 alloc_size;
+                        avail_space += devices_info[i].max_avail * min_stripes;
+                        alloc_size = devices_info[i].max_avail;
+                        /* charge the stripe to every device taking part */
+                        for (j = i + 1 - min_stripes; j <= i; j++)
+                                devices_info[j].max_avail -= alloc_size;
+                }
+                i--;
+                nr_devices--;
+        }
+        kfree(devices_info);
+        *free_bytes = avail_space;
+        return 0;
+}
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -724,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-        u64 total_used_data = 0;
+        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
+        int ret;
+        /* holding chunk_mutex to avoid allocating new chunks */
+        mutex_lock(&root->fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-                if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
-                                    BTRFS_BLOCK_GROUP_SYSTEM))
+                        total_free_data += found->disk_total - found->disk_used;
-                        total_used_data += found->disk_total;
+                        total_free_data -=
-                else
+                                btrfs_account_ro_block_groups_free_space(found);
-                        total_used_data += found->disk_used;
+                }
                total_used += found->disk_used;
        }
        rcu_read_unlock();
@@ -742,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-        buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
+        buf->f_bavail = total_free_data;
+        ret = btrfs_calc_avail_data_space(root, &total_free_data);
+        if (ret) {
+                mutex_unlock(&root->fs_info->chunk_mutex);
+                return ret;
+        }
+        buf->f_bavail += total_free_data;
+        buf->f_bavail = buf->f_bavail >> bits;
+        mutex_unlock(&root->fs_info->chunk_mutex);
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
@@ -861,10 +1136,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                return err;
-        err = btrfs_init_cachep();
+        err = btrfs_init_compress();
        if (err)
                goto free_sysfs;
+        err = btrfs_init_cachep();
+        if (err)
+                goto free_compress;
        err = extent_io_init();
        if (err)
                goto free_cachep;
@@ -892,6 +1171,8 @@ free_extent_io:
        extent_io_exit();
 free_cachep:
        btrfs_destroy_cachep();
+free_compress:
+        btrfs_exit_compress();
 free_sysfs:
        btrfs_exit_sysfs();
        return err;
@@ -906,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
        unregister_filesystem(&btrfs_fs_type);
        btrfs_exit_sysfs();
        btrfs_cleanup_fs_uuids();
-        btrfs_zlib_exit();
+        btrfs_exit_compress();
 }
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bd..bae5c7b8bbe 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
        int ret;
+        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+                return ERR_PTR(-EROFS);
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
@@ -902,6 +905,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        struct btrfs_root *root = pending->root;
        struct btrfs_root *parent_root;
        struct inode *parent_inode;
+        struct dentry *parent;
        struct dentry *dentry;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
@@ -909,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        u64 to_reserve = 0;
        u64 index = 0;
        u64 objectid;
+        u64 root_flags;
        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
@@ -941,7 +946,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        trans->block_rsv = &pending->block_rsv;
        dentry = pending->dentry;
-        parent_inode = dentry->d_parent->d_inode;
+        parent = dget_parent(dentry);
+        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        record_root_in_trans(trans, parent_root);
@@ -965,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+        root_flags = btrfs_root_flags(new_root_item);
+        if (pending->readonly)
+                root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+        else
+                root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+        btrfs_set_root_flags(new_root_item, root_flags);
        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);
        btrfs_set_lock_blocking(old);
@@ -989,6 +1002,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                 parent_inode->i_ino, index,
                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
+        dput(parent);
        key.offset = (u64)-1;
        pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4e..229a594cacd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
        struct btrfs_block_rsv block_rsv;
        /* extra metadata reseration for relocation */
        int error;
+        bool readonly;
        struct list_head list;
 };
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a2..054744ac571 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        struct btrfs_root *root;
+        struct dentry *old_parent = NULL;
        /*
         * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                if (IS_ROOT(parent))
                        break;
-                parent = parent->d_parent;
+                parent = dget_parent(parent);
+                dput(old_parent);
+                old_parent = parent;
                inode = parent->d_inode;
        }
+        dput(old_parent);
 out:
        return ret;
 }
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 {
        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
        struct super_block *sb;
+        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (IS_ROOT(parent))
                        break;
-                parent = parent->d_parent;
+                parent = dget_parent(parent);
+                dput(old_parent);
+                old_parent = parent;
        }
        ret = 0;
 end_trans:
+        dput(old_parent);
        if (ret < 0) {
                BUG_ON(ret != -ENOSPC);
                root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry)
 {
-        return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+        struct dentry *parent = dget_parent(dentry);
-                                      dentry->d_parent, 0);
+        int ret;
+        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+        dput(parent);
+        return ret;
 }
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d..d158530233b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/iocontext.h>
+#include <linux/capability.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -412,12 +413,16 @@ static noinline int device_list_add(const char *path,
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
-        } else if (strcmp(device->name, path)) {
+        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
+                if (device->missing) {
+                        fs_devices->missing_devices--;
+                        device->missing = 0;
+                }
        }
        if (found_transid > fs_devices->latest_trans) {
@@ -489,7 +494,7 @@ again:
                        continue;
                if (device->bdev) {
-                        close_bdev_exclusive(device->bdev, device->mode);
+                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
@@ -523,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev) {
-                        close_bdev_exclusive(device->bdev, device->mode);
+                        blkdev_put(device->bdev, device->mode);
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
@@ -580,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
        int seeding = 1;
        int ret = 0;
+        flags |= FMODE_EXCL;
        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;
-                bdev = open_bdev_exclusive(device->name, flags, holder);
+                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
@@ -594,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
-                if (!bh)
+                if (!bh) {
+                        ret = -EINVAL;
                        goto error_close;
+                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -638,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
                brelse(bh);
 error_close:
-                close_bdev_exclusive(bdev, FMODE_READ);
+                blkdev_put(bdev, flags);
 error:
                continue;
        }
@@ -684,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        mutex_lock(&uuid_mutex);
-        bdev = open_bdev_exclusive(path, flags, holder);
+        flags |= FMODE_EXCL;
+        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
@@ -696,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
-                ret = -EIO;
+                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -716,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        brelse(bh);
 error_close:
-        close_bdev_exclusive(bdev, flags);
+        blkdev_put(bdev, flags);
 error:
        mutex_unlock(&uuid_mutex);
        return ret;
 }
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                   u64 end, u64 *length)
+{
+        struct btrfs_key key;
+        struct btrfs_root *root = device->dev_root;
+        struct btrfs_dev_extent *dev_extent;
+        struct btrfs_path *path;
+        u64 extent_end;
+        int ret;
+        int slot;
+        struct extent_buffer *l;
+        *length = 0;
+        if (start >= device->total_bytes)
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        key.objectid = device->devid;
+        key.offset = start;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        if (ret > 0) {
+                ret = btrfs_previous_item(root, path, key.objectid, key.type);
+                if (ret < 0)
+                        goto out;
+        }
+        while (1) {
+                l = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(l)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret == 0)
+                                continue;
+                        if (ret < 0)
+                                goto out;
+                        break;
+                }
+                btrfs_item_key_to_cpu(l, &key, slot);
+                if (key.objectid < device->devid)
+                        goto next;
+                if (key.objectid > device->devid)
+                        break;
+                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                        goto next;
+                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+                extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                  dev_extent);
+                if (key.offset <= start && extent_end > end) {
+                        *length = end - start + 1;
+                        break;
+                } else if (key.offset <= start && extent_end > start)
+                        *length += extent_end - start;
+                else if (key.offset > start && extent_end <= end)
+                        *length += extent_end - key.offset;
+                else if (key.offset > start && key.offset <= end) {
+                        *length += end - key.offset + 1;
+                        break;
+                } else if (key.offset > end)
+                        break;
+next:
+                path->slots[0]++;
+        }
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
 /*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans:      transaction handle
+ * @device:     the device which we search the free space in
+ * @num_bytes:  the size of the free space that we need
+ * @start:      store the start of the free space.
+ * @len:        the size of the free space that we find, or the size of the max
+ *              free space if we don't find suitable free space
+ *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
+ *
+ * @start is used to store the start of the free space if we find one. But if
+ * we don't find suitable free space, it will be used to store the start
+ * position of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
 */
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
-                         u64 *start, u64 *max_avail)
+                         u64 *start, u64 *len)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
-        struct btrfs_dev_extent *dev_extent = NULL;
+        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
-        u64 hole_size = 0;
+        u64 hole_size;
-        u64 last_byte = 0;
+        u64 max_hole_start;
-        u64 search_start = 0;
+        u64 max_hole_size;
+        u64 extent_end;
+        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
-        int slot = 0;
+        int slot;
-        int start_found;
        struct extent_buffer *l;
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        path->reada = 2;
-        start_found = 0;
        /* FIXME use last free of some kind */
        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
-        search_start = max((u64)1024 * 1024, search_start);
+        search_start = 1024 * 1024;
-        if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+        if (root->fs_info->alloc_start + num_bytes <= search_end)
                search_start = max(root->fs_info->alloc_start, search_start);
+        max_hole_start = search_start;
+        max_hole_size = 0;
+        if (search_start >= search_end) {
+                ret = -ENOSPC;
+                goto error;
+        }
+        path = btrfs_alloc_path();
+        if (!path) {
+                ret = -ENOMEM;
+                goto error;
+        }
+        path->reada = 2;
        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
-                goto error;
+                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
-                        goto error;
+                        goto out;
-                if (ret > 0)
-                        start_found = 1;
        }
-        l = path->nodes[0];
-        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
@@ -783,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
                        if (ret == 0)
                                continue;
                        if (ret < 0)
-                                goto error;
+                                goto out;
-no_more_items:
-                        if (!start_found) {
+                        break;
-                                if (search_start >= search_end) {
-                                        ret = -ENOSPC;
-                                        goto error;
-                                }
-                                *start = search_start;
-                                start_found = 1;
-                                goto check_pending;
-                        }
-                        *start = last_byte > search_start ?
-                                last_byte : search_start;
-                        if (search_end <= *start) {
-                                ret = -ENOSPC;
-                                goto error;
-                        }
-                        goto check_pending;
                }
                btrfs_item_key_to_cpu(l, &key, slot);
@@ -808,48 +911,62 @@ no_more_items:
                        goto next;
                if (key.objectid > device->devid)
-                        goto no_more_items;
+                        break;
-                if (key.offset >= search_start && key.offset > last_byte &&
+                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-                    start_found) {
+                        goto next;
-                        if (last_byte < search_start)
-                                last_byte = search_start;
-                        hole_size = key.offset - last_byte;
-                        if (hole_size > *max_avail)
+                if (key.offset > search_start) {
-                                *max_avail = hole_size;
+                        hole_size = key.offset - search_start;
+                        if (hole_size > max_hole_size) {
+                                max_hole_start = search_start;
+                                max_hole_size = hole_size;
+                        }
-                        if (key.offset > last_byte &&
+                        /*
-                            hole_size >= num_bytes) {
+                         * If this free space is at least as large as what we need,
-                                *start = last_byte;
+                         * it must be the max free space that we have found
-                                goto check_pending;
+                         * until now, so max_hole_start must point to the start
+                         * of this free space and the length of this free space
+                         * is stored in max_hole_size. Thus, we return
+                         * max_hole_start and max_hole_size and go back to the
+                         * caller.
+                         */
+                        if (hole_size >= num_bytes) {
+                                ret = 0;
+                                goto out;
                        }
                }
-                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-                        goto next;
-                start_found = 1;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-                last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+                extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                  dev_extent);
+                if (extent_end > search_start)
+                        search_start = extent_end;
 next:
                path->slots[0]++;
                cond_resched();
        }
-check_pending:
-        /* we have to make sure we didn't find an extent that has already
-         * been allocated by the map tree or the original allocation
-         */
-        BUG_ON(*start < search_start);
-        if (*start + num_bytes > search_end) {
+        hole_size = search_end- search_start;
-                ret = -ENOSPC;
+        if (hole_size > max_hole_size) {
-                goto error;
+                max_hole_start = search_start;
+                max_hole_size = hole_size;
        }
-        /* check for pending inserts here */
-        ret = 0;
-error:
+        /* See above. */
+        if (hole_size < num_bytes)
+                ret = -ENOSPC;
+        else
+                ret = 0;
+out:
        btrfs_free_path(path);
+error:
+        *start = max_hole_start;
+        if (len)
+                *len = max_hole_size;
        return ret;
 }
@@ -1179,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                        goto out;
                }
        } else {
-                bdev = open_bdev_exclusive(device_path, FMODE_READ,
+                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
-                                      root->fs_info->bdev_holder);
+                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
@@ -1189,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
-                        ret = -EIO;
+                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1236,6 +1353,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        device->fs_devices->num_devices--;
+        if (device->missing)
+                root->fs_info->fs_devices->missing_devices--;
        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1244,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;
        if (device->bdev) {
-                close_bdev_exclusive(device->bdev, device->mode);
+                blkdev_put(device->bdev, device->mode);
                device->bdev = NULL;
                device->fs_devices->open_devices--;
        }
@@ -1287,7 +1407,7 @@ error_brelse:
        brelse(bh);
 error_close:
        if (bdev)
-                close_bdev_exclusive(bdev, FMODE_READ);
+                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
@@ -1439,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
                return -EINVAL;
-        bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+        bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+                                  root->fs_info->bdev_holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
@@ -1565,7 +1686,7 @@ out:
        mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 error:
-        close_bdev_exclusive(bdev, 0);
+        blkdev_put(bdev, FMODE_EXCL);
        if (seeding_dev) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
@@ -1905,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
        if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
        mutex_lock(&dev_root->fs_info->volume_mutex);
        dev_root = dev_root->fs_info->dev_root;
@@ -2143,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
                return calc_size * num_stripes;
 }
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+/* Used to sort the devices by max_avail (descending sort) */
-                               struct btrfs_root *extent_root,
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
-                               struct map_lookup **map_ret,
-                               u64 *num_bytes, u64 *stripe_size,
-                               u64 start, u64 type)
 {
-        struct btrfs_fs_info *info = extent_root->fs_info;
+        if (((struct btrfs_device_info *)dev_info1)->max_avail >
-        struct btrfs_device *device = NULL;
+            ((struct btrfs_device_info *)dev_info2)->max_avail)
-        struct btrfs_fs_devices *fs_devices = info->fs_devices;
+                return -1;
-        struct list_head *cur;
+        else if (((struct btrfs_device_info *)dev_info1)->max_avail <
-        struct map_lookup *map = NULL;
+                 ((struct btrfs_device_info *)dev_info2)->max_avail)
-        struct extent_map_tree *em_tree;
+                return 1;
-        struct extent_map *em;
+        else
-        struct list_head private_devs;
+                return 0;
-        int min_stripe_size = 1 * 1024 * 1024;
+}
-        u64 calc_size = 1024 * 1024 * 1024;
-        u64 max_chunk_size = calc_size;
-        u64 min_free;
-        u64 avail;
-        u64 max_avail = 0;
-        u64 dev_offset;
-        int num_stripes = 1;
-        int min_stripes = 1;
-        int sub_stripes = 0;
-        int looped = 0;
-        int ret;
-        int index;
-        int stripe_len = 64 * 1024;
-        if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
-            (type & BTRFS_BLOCK_GROUP_DUP)) {
+                                 int *num_stripes, int *min_stripes,
-                WARN_ON(1);
+                                 int *sub_stripes)
-                type &= ~BTRFS_BLOCK_GROUP_DUP;
+{
-        }
+        *num_stripes = 1;
-        if (list_empty(&fs_devices->alloc_list))
+        *min_stripes = 1;
-                return -ENOSPC;
+        *sub_stripes = 0;
        if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-                num_stripes = fs_devices->rw_devices;
+                *num_stripes = fs_devices->rw_devices;
-                min_stripes = 2;
+                *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-                num_stripes = 2;
+                *num_stripes = 2;
-                min_stripes = 2;
+                *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
                if (fs_devices->rw_devices < 2)
                        return -ENOSPC;
-                num_stripes = 2;
+                *num_stripes = 2;
-                min_stripes = 2;
+                *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-                num_stripes = fs_devices->rw_devices;
+                *num_stripes = fs_devices->rw_devices;
-                if (num_stripes < 4)
+                if (*num_stripes < 4)
                        return -ENOSPC;
-                num_stripes &= ~(u32)1;
+                *num_stripes &= ~(u32)1;
-                sub_stripes = 2;
+                *sub_stripes = 2;
-                min_stripes = 4;
+                *min_stripes = 4;
        }
+        return 0;
+}
+static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+                                    u64 proposed_size, u64 type,
+                                    int num_stripes, int small_stripe)
+{
+        int min_stripe_size = 1 * 1024 * 1024;
+        u64 calc_size = proposed_size;
+        u64 max_chunk_size = calc_size;
+        int ncopies = 1;
+        if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_DUP |
+                    BTRFS_BLOCK_GROUP_RAID10))
+                ncopies = 2;
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_chunk_size = 10 * calc_size;
                min_stripe_size = 64 * 1024 * 1024;
@@ -2219,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);
-again:
+        if (calc_size * num_stripes > max_chunk_size * ncopies) {
-        max_avail = 0;
+                calc_size = max_chunk_size * ncopies;
-        if (!map || map->num_stripes != num_stripes) {
-                kfree(map);
-                map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-                if (!map)
-                        return -ENOMEM;
-                map->num_stripes = num_stripes;
-        }
-        if (calc_size * num_stripes > max_chunk_size) {
-                calc_size = max_chunk_size;
                do_div(calc_size, num_stripes);
-                do_div(calc_size, stripe_len);
+                do_div(calc_size, BTRFS_STRIPE_LEN);
-                calc_size *= stripe_len;
+                calc_size *= BTRFS_STRIPE_LEN;
        }
        /* we don't want tiny stripes */
-        if (!looped)
+        if (!small_stripe)
                calc_size = max_t(u64, min_stripe_size, calc_size);
        /*
-         * we're about to do_div by the stripe_len so lets make sure
+         * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
         * we end up with something bigger than a stripe
         */
-        calc_size = max_t(u64, calc_size, stripe_len * 4);
+        calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+        do_div(calc_size, BTRFS_STRIPE_LEN);
+        calc_size *= BTRFS_STRIPE_LEN;
+        return calc_size;
+}
-        do_div(calc_size, stripe_len);
+static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
-        calc_size *= stripe_len;
+                                                      int num_stripes)
+{
+        struct map_lookup *new;
+        size_t len = map_lookup_size(num_stripes);
+        BUG_ON(map->num_stripes < num_stripes);
+        if (map->num_stripes == num_stripes)
+                return map;
+        new = kmalloc(len, GFP_NOFS);
+        if (!new) {
+                /* just change map->num_stripes */
+                map->num_stripes = num_stripes;
+                return map;
+        }
+        memcpy(new, map, len);
+        new->num_stripes = num_stripes;
+        kfree(map);
+        return new;
+}
+/*
+ * helper to allocate device space from btrfs_device_info, in which we stored
+ * max free space information of every device. It is used when we can not
+ * allocate chunks by default size.
+ *
+ * With this helper, we can allocate a new chunk that is as large as possible.
+ */
+static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+                                    struct btrfs_fs_devices *fs_devices,
+                                    struct btrfs_device_info *devices,
+                                    int nr_device, u64 type,
+                                    struct map_lookup **map_lookup,
+                                    int min_stripes, u64 *stripe_size)
+{
+        int i, index, sort_again = 0;
+        int min_devices = min_stripes;
+        u64 max_avail, min_free;
+        struct map_lookup *map = *map_lookup;
+        int ret;
+        if (nr_device < min_stripes)
+                return -ENOSPC;
+        btrfs_descending_sort_devices(devices, nr_device);
+        max_avail = devices[0].max_avail;
+        if (!max_avail)
+                return -ENOSPC;
+        for (i = 0; i < nr_device; i++) {
+                /*
+                 * if dev_offset = 0, it means the free space of this device
+                 * is less than what we need, and we didn't search max avail
+                 * extent on this device, so do it now.
+                 */
+                if (!devices[i].dev_offset) {
+                        ret = find_free_dev_extent(trans, devices[i].dev,
+                                                   max_avail,
+                                                   &devices[i].dev_offset,
+                                                   &devices[i].max_avail);
+                        if (ret != 0 && ret != -ENOSPC)
+                                return ret;
+                        sort_again = 1;
+                }
+        }
+        /* we updated the max avail free extent of each device, so sort again */
+        if (sort_again)
+                btrfs_descending_sort_devices(devices, nr_device);
+        if (type & BTRFS_BLOCK_GROUP_DUP)
+                min_devices = 1;
+        if (!devices[min_devices - 1].max_avail)
+                return -ENOSPC;
+        max_avail = devices[min_devices - 1].max_avail;
+        if (type & BTRFS_BLOCK_GROUP_DUP)
+                do_div(max_avail, 2);
+        max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+                                             min_stripes, 1);
+        if (type & BTRFS_BLOCK_GROUP_DUP)
+                min_free = max_avail * 2;
+        else
+                min_free = max_avail;
+        if (min_free > devices[min_devices - 1].max_avail)
+                return -ENOSPC;
+        map = __shrink_map_lookup_stripes(map, min_stripes);
+        *stripe_size = max_avail;
+        index = 0;
+        for (i = 0; i < min_stripes; i++) {
+                map->stripes[i].dev = devices[index].dev;
+                map->stripes[i].physical = devices[index].dev_offset;
+                if (type & BTRFS_BLOCK_GROUP_DUP) {
+                        i++;
+                        map->stripes[i].dev = devices[index].dev;
+                        map->stripes[i].physical = devices[index].dev_offset +
+                                                   max_avail;
+                }
+                index++;
+        }
+        *map_lookup = map;
+        return 0;
+}
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root,
+                               struct map_lookup **map_ret,
+                               u64 *num_bytes, u64 *stripe_size,
+                               u64 start, u64 type)
+{
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct btrfs_device *device = NULL;
+        struct btrfs_fs_devices *fs_devices = info->fs_devices;
+        struct list_head *cur;
+        struct map_lookup *map;
+        struct extent_map_tree *em_tree;
+        struct extent_map *em;
+        struct btrfs_device_info *devices_info;
+        struct list_head private_devs;
+        u64 calc_size = 1024 * 1024 * 1024;
+        u64 min_free;
+        u64 avail;
+        u64 dev_offset;
+        int num_stripes;
+        int min_stripes;
+        int sub_stripes;
+        int min_devices;        /* the min number of devices we need */
+        int i;
+        int ret;
+        int index;
+        if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+            (type & BTRFS_BLOCK_GROUP_DUP)) {
+                WARN_ON(1);
+                type &= ~BTRFS_BLOCK_GROUP_DUP;
+        }
+        if (list_empty(&fs_devices->alloc_list))
+                return -ENOSPC;
+        ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+                                    &min_stripes, &sub_stripes);
+        if (ret)
+                return ret;
+        devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+                               GFP_NOFS);
+        if (!devices_info)
+                return -ENOMEM;
+        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+        if (!map) {
+                ret = -ENOMEM;
+                goto error;
+        }
+        map->num_stripes = num_stripes;
        cur = fs_devices->alloc_list.next;
        index = 0;
+        i = 0;
-        if (type & BTRFS_BLOCK_GROUP_DUP)
+        calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+                                             num_stripes, 0);
+        if (type & BTRFS_BLOCK_GROUP_DUP) {
                min_free = calc_size * 2;
-        else
+                min_devices = 1;
+        } else {
                min_free = calc_size;
+                min_devices = min_stripes;
-        /*
+        }
-         * we add 1MB because we never use the first 1MB of the device, unless
-         * we've looped, then we are likely allocating the maximum amount of
-         * space left already
-         */
-        if (!looped)
-                min_free += 1024 * 1024;
        INIT_LIST_HEAD(&private_devs);
        while (index < num_stripes) {
@@ -2276,27 +2559,39 @@ again:
                cur = cur->next;
                if (device->in_fs_metadata && avail >= min_free) {
-                        ret = find_free_dev_extent(trans, device,
+                        ret = find_free_dev_extent(trans, device, min_free,
-                                                   min_free, &dev_offset,
+                                                   &devices_info[i].dev_offset,
-                                                   &max_avail);
+                                                   &devices_info[i].max_avail);
                        if (ret == 0) {
                                list_move_tail(&device->dev_alloc_list,
                                               &private_devs);
                                map->stripes[index].dev = device;
-                                map->stripes[index].physical = dev_offset;
+                                map->stripes[index].physical =
+                                                devices_info[i].dev_offset;
                                index++;
                                if (type & BTRFS_BLOCK_GROUP_DUP) {
                                        map->stripes[index].dev = device;
                                        map->stripes[index].physical =
-                                                dev_offset + calc_size;
+                                                devices_info[i].dev_offset +
+                                                calc_size;
                                        index++;
                                }
-                        }
+                        } else if (ret != -ENOSPC)
-                } else if (device->in_fs_metadata && avail > max_avail)
+                                goto error;
-                        max_avail = avail;
+                        devices_info[i].dev = device;
+                        i++;
+                } else if (device->in_fs_metadata &&
+                           avail >= BTRFS_STRIPE_LEN) {
+                        devices_info[i].dev = device;
+                        devices_info[i].max_avail = avail;
+                        i++;
+                }
                if (cur == &fs_devices->alloc_list)
                        break;
        }
        list_splice(&private_devs, &fs_devices->alloc_list);
        if (index < num_stripes) {
                if (index >= min_stripes) {
@@ -2305,34 +2600,36 @@ again:
                                num_stripes /= sub_stripes;
                                num_stripes *= sub_stripes;
                        }
-                        looped = 1;
-                        goto again;
+                        map = __shrink_map_lookup_stripes(map, num_stripes);
-                }
+                } else if (i >= min_devices) {
-                if (!looped && max_avail > 0) {
+                        ret = __btrfs_alloc_tiny_space(trans, fs_devices,
-                        looped = 1;
+                                                       devices_info, i, type,
-                        calc_size = max_avail;
+                                                       &map, min_stripes,
-                        goto again;
+                                                       &calc_size);
+                        if (ret)
+                                goto error;
+                } else {
+                        ret = -ENOSPC;
+                        goto error;
                }
-                kfree(map);
-                return -ENOSPC;
        }
        map->sector_size = extent_root->sectorsize;
-        map->stripe_len = stripe_len;
+        map->stripe_len = BTRFS_STRIPE_LEN;
-        map->io_align = stripe_len;
+        map->io_align = BTRFS_STRIPE_LEN;
-        map->io_width = stripe_len;
+        map->io_width = BTRFS_STRIPE_LEN;
        map->type = type;
-        map->num_stripes = num_stripes;
        map->sub_stripes = sub_stripes;
        *map_ret = map;
        *stripe_size = calc_size;
        *num_bytes = chunk_bytes_by_type(type, calc_size,
-                                         num_stripes, sub_stripes);
+                                         map->num_stripes, sub_stripes);
        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
-                kfree(map);
+                ret = -ENOMEM;
-                return -ENOMEM;
+                goto error;
        }
        em->bdev = (struct block_device *)map;
        em->start = start;
@@ -2365,7 +2662,13 @@ again:
                index++;
        }
+        kfree(devices_info);
        return 0;
+error:
+        kfree(map);
+        kfree(devices_info);
+        return ret;
 }
 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -3080,7 +3383,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
        device->devid = devid;
        device->work.func = pending_bios_fn;
        device->fs_devices = fs_devices;
+        device->missing = 1;
        fs_devices->num_devices++;
+        fs_devices->missing_devices++;
        spin_lock_init(&device->io_lock);
        INIT_LIST_HEAD(&device->dev_alloc_list);
        memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3583,15 @@ static int read_one_dev(struct btrfs_root *root,
                        device = add_missing_dev(root, devid, dev_uuid);
                        if (!device)
                                return -ENOMEM;
+                } else if (!device->missing) {
+                        /*
+                         * this happens when a device that was properly setup
+                         * in the device info lists suddenly goes bad.
+                         * device->bdev is NULL, and so we have to set
+                         * device->missing to one here
+                         */
+                        root->fs_info->fs_devices->missing_devices++;
+                        device->missing = 1;
                }
        }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4ee..7fb59d45fe8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
 #define __BTRFS_VOLUMES_
 #include <linux/bio.h>
+#include <linux/sort.h>
 #include "async-thread.h"
+#define BTRFS_STRIPE_LEN        (64 * 1024)
 struct buffer_head;
 struct btrfs_pending_bios {
        struct bio *head;
@@ -44,12 +47,13 @@ struct btrfs_device {
        int writeable;
        int in_fs_metadata;
+        int missing;
        spinlock_t io_lock;
        struct block_device *bdev;
-        /* the mode sent to open_bdev_exclusive */
+        /* the mode sent to blkdev_get */
        fmode_t mode;
        char *name;
@@ -93,6 +97,7 @@ struct btrfs_fs_devices {
        u64 num_devices;
        u64 open_devices;
        u64 rw_devices;
+        u64 missing_devices;
        u64 total_rw_bytes;
        struct block_device *latest_bdev;
@@ -134,6 +139,30 @@ struct btrfs_multi_bio {
        struct btrfs_bio_stripe stripes[];
 };
+struct btrfs_device_info {
+        struct btrfs_device *dev;
+        u64 dev_offset;
+        u64 max_avail;
+};
+/* Used to sort the devices by max_avail(descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+/*
+ * sort the devices by max_avail, in which max free extent size of each device
+ * is stored.(Descending Sort)
+ */
+static inline void btrfs_descending_sort_devices(
+                                        struct btrfs_device_info *devices,
+                                        size_t nr_devices)
+{
+        sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+             btrfs_cmp_device_free_bytes, NULL);
+}
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                   u64 end, u64 *length);
 #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739..a5776531dc2 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                   size_t size, int flags)
 {
+        struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+        /*
+         * The permission on security.* and system.* is not checked
+         * in permission().
+         */
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        /*
         * If this is a request for a synthetic attribute in the system.*
         * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 int btrfs_removexattr(struct dentry *dentry, const char *name)
 {
+        struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+        /*
+         * The permission on security.* and system.* is not checked
+         * in permission().
+         */
+        if (btrfs_root_readonly(root))
+                return -EROFS;
        /*
         * If this is a request for a synthetic attribute in the system.*
         * namespace use the generic infrastructure to resolve a handler
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71..f5ec2d44150 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
 #include <linux/bio.h>
 #include "compression.h"
-/* Plan: call deflate() with avail_in == *sourcelen,
-        avail_out = *dstlen - 12 and flush == Z_FINISH.
-        If it doesn't manage to finish, call it again with
-        avail_in == 0 and avail_out set to the remaining 12
-        bytes for it to clean up.
-   Q: Is 12 bytes sufficient?
-*/
-#define STREAM_END_SPACE 12
 struct workspace {
        z_stream inf_strm;
        z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
        struct list_head list;
 };
-static LIST_HEAD(idle_workspace);
+static void zlib_free_workspace(struct list_head *ws)
-static DEFINE_SPINLOCK(workspace_lock);
+{
-static unsigned long num_workspace;
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
-static atomic_t alloc_workspace = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
-/*
+        vfree(workspace->def_strm.workspace);
- * this finds an available zlib workspace or allocates a new one
+        vfree(workspace->inf_strm.workspace);
- * NULL or an ERR_PTR is returned if things go bad.
+        kfree(workspace->buf);
- */
+        kfree(workspace);
-static struct workspace *find_zlib_workspace(void)
+}
+static struct list_head *zlib_alloc_workspace(void)
 {
        struct workspace *workspace;
-        int ret;
-        int cpus = num_online_cpus();
-again:
-        spin_lock(&workspace_lock);
-        if (!list_empty(&idle_workspace)) {
-                workspace = list_entry(idle_workspace.next, struct workspace,
-                                       list);
-                list_del(&workspace->list);
-                num_workspace--;
-                spin_unlock(&workspace_lock);
-                return workspace;
-        }
-        spin_unlock(&workspace_lock);
-        if (atomic_read(&alloc_workspace) > cpus) {
-                DEFINE_WAIT(wait);
-                prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-                if (atomic_read(&alloc_workspace) > cpus)
-                        schedule();
-                finish_wait(&workspace_wait, &wait);
-                goto again;
-        }
-        atomic_inc(&alloc_workspace);
        workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
-        if (!workspace) {
+        if (!workspace)
-                ret = -ENOMEM;
+                return ERR_PTR(-ENOMEM);
-                goto fail;
-        }
        workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
-        if (!workspace->def_strm.workspace) {
-                ret = -ENOMEM;
-                goto fail;
-        }
        workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
-        if (!workspace->inf_strm.workspace) {
-                ret = -ENOMEM;
-                goto fail_inflate;
-        }
        workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-        if (!workspace->buf) {
+        if (!workspace->def_strm.workspace ||
-                ret = -ENOMEM;
+            !workspace->inf_strm.workspace || !workspace->buf)
-                goto fail_kmalloc;
+                goto fail;
-        }
-        return workspace;
-fail_kmalloc:
-        vfree(workspace->inf_strm.workspace);
-fail_inflate:
-        vfree(workspace->def_strm.workspace);
-fail:
-        kfree(workspace);
-        atomic_dec(&alloc_workspace);
-        wake_up(&workspace_wait);
-        return ERR_PTR(ret);
-}
-/*
- * put a workspace struct back on the list or free it if we have enough
- * idle ones sitting around
- */
-static int free_workspace(struct workspace *workspace)
-{
-        spin_lock(&workspace_lock);
-        if (num_workspace < num_online_cpus()) {
-                list_add_tail(&workspace->list, &idle_workspace);
-                num_workspace++;
-                spin_unlock(&workspace_lock);
-                if (waitqueue_active(&workspace_wait))
-                        wake_up(&workspace_wait);
-                return 0;
-        }
-        spin_unlock(&workspace_lock);
-        vfree(workspace->def_strm.workspace);
-        vfree(workspace->inf_strm.workspace);
-        kfree(workspace->buf);
-        kfree(workspace);
-        atomic_dec(&alloc_workspace);
+        INIT_LIST_HEAD(&workspace->list);
-        if (waitqueue_active(&workspace_wait))
-                wake_up(&workspace_wait);
-        return 0;
-}
-/*
+        return &workspace->list;
- * cleanup function for module exit
+fail:
- */
+        zlib_free_workspace(&workspace->list);
-static void free_workspaces(void)
+        return ERR_PTR(-ENOMEM);
-{
-        struct workspace *workspace;
-        while (!list_empty(&idle_workspace)) {
-                workspace = list_entry(idle_workspace.next, struct workspace,
-                                       list);
-                list_del(&workspace->list);
-                vfree(workspace->def_strm.workspace);
-                vfree(workspace->inf_strm.workspace);
-                kfree(workspace->buf);
-                kfree(workspace);
-                atomic_dec(&alloc_workspace);
-        }
 }
-/*
+static int zlib_compress_pages(struct list_head *ws,
- * given an address space and start/len, compress the bytes.
+                               struct address_space *mapping,
- *
+                               u64 start, unsigned long len,
- * pages are allocated to hold the compressed result and stored
+                               struct page **pages,
- * in 'pages'
+                               unsigned long nr_dest_pages,
- *
+                               unsigned long *out_pages,
- * out_pages is used to return the number of pages allocated.  There
+                               unsigned long *total_in,
- * may be pages allocated even if we return an error
+                               unsigned long *total_out,
- *
+                               unsigned long max_out)
- * total_in is used to return the number of bytes actually read.  It
- * may be smaller then len if we had to exit early because we
- * ran out of room in the pages array or because we cross the
- * max_out threshold.
- *
- * total_out is used to return the total number of compressed bytes
- *
- * max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
- */
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                              u64 start, unsigned long len,
-                              struct page **pages,
-                              unsigned long nr_dest_pages,
-                              unsigned long *out_pages,
-                              unsigned long *total_in,
-                              unsigned long *total_out,
-                              unsigned long max_out)
 {
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
        int ret;
-        struct workspace *workspace;
        char *data_in;
        char *cpage_out;
        int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        *total_out = 0;
        *total_in = 0;
-        workspace = find_zlib_workspace();
-        if (IS_ERR(workspace))
-                return -1;
        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
                printk(KERN_WARNING "deflateInit failed\n");
                ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        data_in = kmap(in_page);
        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+        if (out_page == NULL) {
+                ret = -1;
+                goto out;
+        }
        cpage_out = kmap(out_page);
        pages[0] = out_page;
        nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
                                goto out;
                        }
                        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                        if (out_page == NULL) {
+                                ret = -1;
+                                goto out;
+                        }
                        cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
@@ -314,55 +208,26 @@ out:
                kunmap(in_page);
                page_cache_release(in_page);
        }
-        free_workspace(workspace);
        return ret;
 }
-/*
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
- * pages_in is an array of pages with compressed data.
+                                  u64 disk_start,
- *
+                                  struct bio_vec *bvec,
- * disk_start is the starting logical offset of this array in the file
+                                  int vcnt,
- *
+                                  size_t srclen)
- * bvec is a bio_vec of pages from the file that we want to decompress into
- *
- * vcnt is the count of pages in the biovec
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous.  They all correspond to the range of bytes covered by
- * the compressed extent.
- */
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                              u64 disk_start,
-                              struct bio_vec *bvec,
-                              int vcnt,
-                              size_t srclen)
 {
-        int ret = 0;
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
+        int ret = 0, ret2;
        int wbits = MAX_WBITS;
-        struct workspace *workspace;
        char *data_in;
        size_t total_out = 0;
-        unsigned long page_bytes_left;
        unsigned long page_in_index = 0;
        unsigned long page_out_index = 0;
-        struct page *page_out;
        unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
                                        PAGE_CACHE_SIZE;
        unsigned long buf_start;
-        unsigned long buf_offset;
-        unsigned long bytes;
-        unsigned long working_bytes;
        unsigned long pg_offset;
-        unsigned long start_byte;
-        unsigned long current_buf_start;
-        char *kaddr;
-        workspace = find_zlib_workspace();
-        if (IS_ERR(workspace))
-                return -ENOMEM;
        data_in = kmap(pages_in[page_in_index]);
        workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
        workspace->inf_strm.total_out = 0;
        workspace->inf_strm.next_out = workspace->buf;
        workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
-        page_out = bvec[page_out_index].bv_page;
-        page_bytes_left = PAGE_CACHE_SIZE;
        pg_offset = 0;
        /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                printk(KERN_WARNING "inflateInit failed\n");
-                ret = -1;
+                return -1;
-                goto out;
        }
        while (workspace->inf_strm.total_in < srclen) {
                ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
                if (ret != Z_OK && ret != Z_STREAM_END)
                        break;
-                /*
-                 * buf start is the byte offset we're of the start of
-                 * our workspace buffer
-                 */
-                buf_start = total_out;
-                /* total_out is the last byte of the workspace buffer */
+                buf_start = total_out;
                total_out = workspace->inf_strm.total_out;
-                working_bytes = total_out - buf_start;
+                /* we didn't make progress in this inflate call, we're done */
+                if (buf_start == total_out)
-                /*
-                 * start byte is the first byte of the page we're currently
-                 * copying into relative to the start of the compressed data.
-                 */
-                start_byte = page_offset(page_out) - disk_start;
-                if (working_bytes == 0) {
-                        /* we didn't make progress in this inflate
-                         * call, we're done
-                         */
-                        if (ret != Z_STREAM_END)
-                                ret = -1;
                        break;
-                }
-                /* we haven't yet hit data corresponding to this page */
+                ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
-                if (total_out <= start_byte)
+                                                 total_out, disk_start,
-                        goto next;
+                                                 bvec, vcnt,
+                                                 &page_out_index, &pg_offset);
-                /*
+                if (ret2 == 0) {
-                 * the start of the data we care about is offset into
+                        ret = 0;
-                 * the middle of our working buffer
+                        goto done;
-                 */
-                if (total_out > start_byte && buf_start < start_byte) {
-                        buf_offset = start_byte - buf_start;
-                        working_bytes -= buf_offset;
-                } else {
-                        buf_offset = 0;
-                }
-                current_buf_start = buf_start;
-                /* copy bytes from the working buffer into the pages */
-                while (working_bytes > 0) {
-                        bytes = min(PAGE_CACHE_SIZE - pg_offset,
-                                    PAGE_CACHE_SIZE - buf_offset);
-                        bytes = min(bytes, working_bytes);
-                        kaddr = kmap_atomic(page_out, KM_USER0);
-                        memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
-                               bytes);
-                        kunmap_atomic(kaddr, KM_USER0);
-                        flush_dcache_page(page_out);
-                        pg_offset += bytes;
-                        page_bytes_left -= bytes;
-                        buf_offset += bytes;
-                        working_bytes -= bytes;
-                        current_buf_start += bytes;
-                        /* check if we need to pick another page */
-                        if (page_bytes_left == 0) {
-                                page_out_index++;
-                                if (page_out_index >= vcnt) {
-                                        ret = 0;
-                                        goto done;
-                                }
-                                page_out = bvec[page_out_index].bv_page;
-                                pg_offset = 0;
-                                page_bytes_left = PAGE_CACHE_SIZE;
-                                start_byte = page_offset(page_out) - disk_start;
-                                /*
-                                 * make sure our new page is covered by this
-                                 * working buffer
-                                 */
-                                if (total_out <= start_byte)
-                                        goto next;
-                                /* the next page in the biovec might not
-                                 * be adjacent to the last page, but it
-                                 * might still be found inside this working
-                                 * buffer.  bump our offset pointer
-                                 */
-                                if (total_out > start_byte &&
-                                    current_buf_start < start_byte) {
-                                        buf_offset = start_byte - buf_start;
-                                        working_bytes = total_out - start_byte;
-                                        current_buf_start = buf_start +
-                                                buf_offset;
-                                }
-                        }
                }
-next:
                workspace->inf_strm.next_out = workspace->buf;
                workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
@@ -516,35 +301,21 @@ done:
        zlib_inflateEnd(&workspace->inf_strm);
        if (data_in)
                kunmap(pages_in[page_in_index]);
-out:
-        free_workspace(workspace);
        return ret;
 }
-/*
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- * a less complex decompression routine.  Our compressed data fits in a
+                           struct page *dest_page,
- * single page, and we want to read a single page out of it.
+                           unsigned long start_byte,
- * start_byte tells us the offset into the compressed data we're interested in
+                           size_t srclen, size_t destlen)
- */
-int btrfs_zlib_decompress(unsigned char *data_in,
-                          struct page *dest_page,
-                          unsigned long start_byte,
-                          size_t srclen, size_t destlen)
 {
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
        int ret = 0;
        int wbits = MAX_WBITS;
-        struct workspace *workspace;
        unsigned long bytes_left = destlen;
        unsigned long total_out = 0;
        char *kaddr;
-        if (destlen > PAGE_CACHE_SIZE)
-                return -ENOMEM;
-        workspace = find_zlib_workspace();
-        if (IS_ERR(workspace))
-                return -ENOMEM;
        workspace->inf_strm.next_in = data_in;
        workspace->inf_strm.avail_in = srclen;
        workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                printk(KERN_WARNING "inflateInit failed\n");
-                ret = -1;
+                return -1;
-                goto out;
        }
        while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
                ret = 0;
        zlib_inflateEnd(&workspace->inf_strm);
-out:
-        free_workspace(workspace);
        return ret;
 }
-void btrfs_zlib_exit(void)
+struct btrfs_compress_op btrfs_zlib_compress = {
-{
+        .alloc_workspace        = zlib_alloc_workspace,
-    free_workspaces();
+        .free_workspace         = zlib_free_workspace,
-}
+        .compress_pages         = zlib_compress_pages,
+        .decompress_biovec      = zlib_decompress_biovec,
+        .decompress             = zlib_decompress,
+};
diff --git a/fs/buffer.c b/fs/buffer.c
index 5930e382959..2219a76e2ca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
 static void bh_lru_install(struct buffer_head *bh)
 {
        struct buffer_head *evictee = NULL;
-        struct bh_lru *lru;
        check_irqs_on();
        bh_lru_lock();
-        lru = &__get_cpu_var(bh_lrus);
+        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
-        if (lru->bhs[0] != bh) {
                struct buffer_head *bhs[BH_LRU_SIZE];
                int in;
                int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
                get_bh(bh);
                bhs[out++] = bh;
                for (in = 0; in < BH_LRU_SIZE; in++) {
-                        struct buffer_head *bh2 = lru->bhs[in];
+                        struct buffer_head *bh2 =
+                                __this_cpu_read(bh_lrus.bhs[in]);
                        if (bh2 == bh) {
                                __brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
                }
                while (out < BH_LRU_SIZE)
                        bhs[out++] = NULL;
-                memcpy(lru->bhs, bhs, sizeof(bhs));
+                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
        }
        bh_lru_unlock();
@@ -1313,23 +1312,22 @@ static struct buffer_head *
 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 {
        struct buffer_head *ret = NULL;
-        struct bh_lru *lru;
        unsigned int i;
        check_irqs_on();
        bh_lru_lock();
-        lru = &__get_cpu_var(bh_lrus);
        for (i = 0; i < BH_LRU_SIZE; i++) {
-                struct buffer_head *bh = lru->bhs[i];
+                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
                if (bh && bh->b_bdev == bdev &&
                                bh->b_blocknr == block && bh->b_size == size) {
                        if (i) {
                                while (i) {
-                                        lru->bhs[i] = lru->bhs[i - 1];
+                                        __this_cpu_write(bh_lrus.bhs[i],
+                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
                                        i--;
                                }
-                                lru->bhs[0] = bh;
+                                __this_cpu_write(bh_lrus.bhs[0], bh);
                        }
                        get_bh(bh);
                        ret = bh;
@@ -3203,22 +3201,23 @@ static void recalc_bh_state(void)
        int i;
        int tot = 0;
-        if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
+        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
-        __get_cpu_var(bh_accounting).ratelimit = 0;
+        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
 }
-        
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
        if (ret) {
                INIT_LIST_HEAD(&ret->b_assoc_buffers);
-                get_cpu_var(bh_accounting).nr++;
+                preempt_disable();
+                __this_cpu_inc(bh_accounting.nr);
                recalc_bh_state();
-                put_cpu_var(bh_accounting);
+                preempt_enable();
        }
        return ret;
 }
@@ -3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
 {
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
-        get_cpu_var(bh_accounting).nr--;
+        preempt_disable();
+        __this_cpu_dec(bh_accounting.nr);
        recalc_bh_state();
-        put_cpu_var(bh_accounting);
+        preempt_enable();
 }
 EXPORT_SYMBOL(free_buffer_head);
@@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
                brelse(b->bhs[i]);
                b->bhs[i] = NULL;
        }
-        get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
+        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;
-        put_cpu_var(bh_accounting);
 }
 static int buffer_cpu_notify(struct notifier_block *self,
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff..bd352125e82 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
 # Makefile for CEPH filesystem.
 #
-ifneq ($(KERNELRELEASE),)
 obj-$(CONFIG_CEPH_FS) += ceph.o
-ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
        export.o caps.o snap.o xattr.o \
        mds_client.o mdsmap.o strings.o ceph_frag.o \
        debugfs.o
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-default: all
-all:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
-modules_install:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
-clean:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-endif
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e..561438b6a50 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
        err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                  page->index << PAGE_CACHE_SHIFT, &len,
                                  ci->i_truncate_seq, ci->i_truncate_size,
-                                  &page, 1);
+                                  &page, 1, 0);
        if (err == -ENOENT)
                err = 0;
        if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                 offset, &len,
                                 ci->i_truncate_seq, ci->i_truncate_size,
-                                 pages, nr_pages);
+                                 pages, nr_pages, 0);
        if (rc == -ENOENT)
                rc = 0;
        if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
                                            snapc, do_sync,
                                            ci->i_truncate_seq,
                                            ci->i_truncate_size,
-                                            &inode->i_mtime, true, 1);
+                                            &inode->i_mtime, true, 1, 0);
                                max_pages = req->r_num_pages;
                                alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71..6b61ded701e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
            invalidating_gen == ci->i_rdcache_gen) {
                /* success. */
                dout("try_nonblocking_invalidate %p success\n", inode);
-                ci->i_rdcache_gen = 0;
+                /* save any racing async invalidate some trouble */
-                ci->i_rdcache_revoking = 0;
+                ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
                return 0;
        }
        dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1560,9 +1560,10 @@ retry_locked:
                /* NOTE: no side-effects allowed, until we take s_mutex */
                revoking = cap->implemented & ~cap->issued;
-                if (revoking)
+                dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
-                        dout(" mds%d revoking %s\n", cap->mds,
+                     cap->mds, cap, ceph_cap_string(cap->issued),
-                             ceph_cap_string(revoking));
+                     ceph_cap_string(cap->implemented),
+                     ceph_cap_string(revoking));
                if (cap == ci->i_auth_cap &&
                    (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
                if (cap == ci->i_auth_cap && ci->i_dirty_caps)
                        flushing = __mark_caps_flushing(inode, session);
+                else
+                        flushing = 0;
                mds = cap->mds;  /* remember mds, so we don't repeat */
                sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
        }
 }
+/*
+ * Re-send any in-progress cap flush for a single inode.
+ *
+ * Takes inode->i_lock, reads the inode's auth cap, and calls
+ * __ceph_flush_snaps() with @session.  If ci->i_flushing_caps is still
+ * nonzero afterwards, a CEPH_CAP_OP_FLUSH is sent on the auth cap via
+ * __send_cap(); a nonzero return from __send_cap() causes the inode to
+ * be requeued with __cap_delay_requeue().
+ *
+ * @mdsc:    mds client, passed through to __send_cap() and
+ *           __cap_delay_requeue()
+ * @session: session handed (by address) to __ceph_flush_snaps()
+ * @inode:   inode whose flushing caps are re-sent
+ *
+ * i_lock is not held on return.
+ */
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+                                     struct ceph_mds_session *session,
+                                     struct inode *inode)
+{
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        struct ceph_cap *cap;
+        int delayed = 0;
+        spin_lock(&inode->i_lock);
+        cap = ci->i_auth_cap;
+        dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+             ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+        __ceph_flush_snaps(ci, &session, 1);
+        if (ci->i_flushing_caps) {
+                /*
+                 * NOTE(review): this branch never unlocks i_lock itself,
+                 * while the else branch does and the delayed path below
+                 * re-takes it -- so __send_cap() presumably returns with
+                 * i_lock released; confirm against its definition.
+                 * cap is dereferenced here without a NULL check.
+                 */
+                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                     __ceph_caps_used(ci),
+                                     __ceph_caps_wanted(ci),
+                                     cap->issued | cap->implemented,
+                                     ci->i_flushing_caps, NULL);
+                if (delayed) {
+                        spin_lock(&inode->i_lock);
+                        __cap_delay_requeue(mdsc, ci);
+                        spin_unlock(&inode->i_lock);
+                }
+        } else {
+                /* nothing is flushing: just drop the lock taken above */
+                spin_unlock(&inode->i_lock);
+        }
+}
 /*
 * Take references to capabilities we hold, so that we don't release
@@ -2273,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
-        unsigned seq = le32_to_cpu(grant->seq);
+        int seq = le32_to_cpu(grant->seq);
-        unsigned issue_seq = le32_to_cpu(grant->issue_seq);
        int newcaps = le32_to_cpu(grant->caps);
        int issued, implemented, used, wanted, dirty;
        u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        int revoked_rdcache = 0;
        int queue_invalidate = 0;
-        dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-             inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
+             inode, cap, mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
                inode->i_size);
@@ -2383,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        }
        cap->seq = seq;
-        cap->issue_seq = issue_seq;
        /* file layout may have changed */
        ci->i_layout = grant->layout;
@@ -2689,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
        ceph_add_cap(inode, session, cap_id, -1,
                     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
                     NULL /* no caps context */);
-        try_flush_caps(inode, session, NULL);
+        kick_flushing_inode_caps(mdsc, session, inode);
        up_read(&mdsc->snap_rwsem);
+        /* make sure we re-request max_size, if necessary */
+        spin_lock(&inode->i_lock);
+        ci->i_requested_max_size = 0;
+        spin_unlock(&inode->i_lock);
 }
 /*
@@ -2782,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        case CEPH_CAP_OP_IMPORT:
                handle_cap_import(mdsc, inode, h, session,
                                  snaptrace, snaptrace_len);
-                ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+                ceph_check_caps(ceph_inode(inode), 0, session);
-                                session);
                goto done_unlocked;
        }
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b5..08f65faac11 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
        for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
                req = rb_entry(rp, struct ceph_mds_request, r_node);
-                if (req->r_request)
+                if (req->r_request && req->r_session)
-                        seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+                        seq_printf(s, "%lld\tmds%d\t", req->r_tid,
-                else
+                                   req->r_session->s_mds);
+                else if (!req->r_request)
                        seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+                else
+                        seq_printf(s, "%lld\t(no session)\t", req->r_tid);
                seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcaf..0bc68de8edd 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_fsdata)
                return 0;
-        if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+        if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-                dentry->d_op = &ceph_dentry_ops;
+            ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+                d_set_d_op(dentry, &ceph_dentry_ops);
        else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-                dentry->d_op = &ceph_snapdir_dentry_ops;
+                d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
        else
-                dentry->d_op = &ceph_snap_dentry_ops;
+                d_set_d_op(dentry, &ceph_snap_dentry_ops);
        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
        if (!di)
@@ -111,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
        dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
             last);
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        /* start at beginning? */
-        if (filp->f_pos == 2 || (last &&
+        if (filp->f_pos == 2 || last == NULL ||
-                                 filp->f_pos < ceph_dentry(last)->offset)) {
+            filp->f_pos < ceph_dentry(last)->offset) {
                if (list_empty(&parent->d_subdirs))
                        goto out_unlock;
                p = parent->d_subdirs.prev;
@@ -135,6 +136,7 @@ more:
                        fi->at_end = 1;
                        goto out_unlock;
                }
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                if (!d_unhashed(dentry) && dentry->d_inode &&
                    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
                    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -144,13 +146,15 @@ more:
                     dentry->d_name.len, dentry->d_name.name, di->offset,
                     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
                     !dentry->d_inode ? " null" : "");
+                spin_unlock(&dentry->d_lock);
                p = p->prev;
                dentry = list_entry(p, struct dentry, d_u.d_child);
                di = ceph_dentry(dentry);
        }
-        atomic_inc(&dentry->d_count);
+        dget_dlock(dentry);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
        dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
             dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -176,19 +180,19 @@ more:
        filp->f_pos++;
-        /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+        /* make sure a dentry wasn't dropped while we didn't have parent lock */
        if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
                dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
                err = -EAGAIN;
                goto out;
        }
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        p = p->prev;    /* advance to next dentry */
        goto more;
 out_unlock:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 out:
        if (last)
                dput(last);
@@ -336,7 +340,10 @@ more:
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
-                        fi->next_offset = 2;
+                        if (ceph_frag_is_rightmost(frag))
+                                fi->next_offset = 2;
+                        else
+                                fi->next_offset = 0;
                } else {
                        rinfo = &req->r_reply_info;
                        err = note_last_dentry(fi,
@@ -355,18 +362,22 @@ more:
                u64 pos = ceph_make_fpos(frag, off);
                struct ceph_mds_reply_inode *in =
                        rinfo->dir_in[off - fi->offset].in;
+                struct ceph_vino vino;
+                ino_t ino;
                dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
                     off, off - fi->offset, rinfo->dir_nr, pos,
                     rinfo->dir_dname_len[off - fi->offset],
                     rinfo->dir_dname[off - fi->offset], in);
                BUG_ON(!in);
                ftype = le32_to_cpu(in->mode) >> 12;
+                vino.ino = le64_to_cpu(in->ino);
+                vino.snap = le64_to_cpu(in->snapid);
+                ino = ceph_vino_to_ino(vino);
                if (filldir(dirent,
                            rinfo->dir_dname[off - fi->offset],
                            rinfo->dir_dname_len[off - fi->offset],
-                            pos,
+                            pos, ino, ftype) < 0) {
-                            le64_to_cpu(in->ino),
-                            ftype) < 0) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
@@ -414,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
                fi->last_readdir = NULL;
        }
        kfree(fi->last_name);
+        fi->last_name = NULL;
        fi->next_offset = 2;  /* compensate for . and .. */
        if (fi->dentry) {
                dput(fi->dentry);
@@ -978,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct inode *dir;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        dir = dentry->d_parent->d_inode;
        dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
             dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1207,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
        }
 }
+/*
+ * Return name hash for a given dentry.  This is dependent on
+ * the parent directory's hash function.
+ *
+ * The hash type is read from the parent inode's i_dir_layout.dl_dir_hash.
+ * For 0 or CEPH_STR_HASH_LINUX the hash already stored in dn->d_name.hash
+ * is returned unchanged; for any other type the hash is recomputed over
+ * the dentry name with ceph_str_hash().
+ *
+ * Dereferences dn->d_parent->d_inode without checking either for NULL.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+        struct inode *dir = dn->d_parent->d_inode;
+        struct ceph_inode_info *dci = ceph_inode(dir);
+        switch (dci->i_dir_layout.dl_dir_hash) {
+        case 0: /* for backward compat */
+        case CEPH_STR_HASH_LINUX:
+                return dn->d_name.hash;
+        default:
+                return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+                                     dn->d_name.name, dn->d_name.len);
+        }
+}
 const struct file_operations ceph_dir_fops = {
        .read = ceph_read_dir,
        .readdir = ceph_readdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d942699..e41056174bf 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
                dout("encode_fh %p connectable\n", dentry);
                cfh->ino = ceph_ino(dentry->d_inode);
                cfh->parent_ino = ceph_ino(parent->d_inode);
-                cfh->parent_name_hash = parent->d_name.hash;
+                cfh->parent_name_hash = ceph_dentry_hash(parent);
                *max_len = connected_handle_length;
                type = 2;
        } else if (*max_len >= handle_length) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf369..7d0e4a82d89 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
        }
        /*
-         * No need to block if we have any caps.  Update wanted set
+         * No need to block if we have caps on the auth MDS (for
+         * write) or any MDS (for read).  Update wanted set
         * asynchronously.
         */
        spin_lock(&inode->i_lock);
-        if (__ceph_is_any_real_caps(ci)) {
+        if (__ceph_is_any_real_caps(ci) &&
+            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
                int mds_wanted = __ceph_caps_mds_wanted(ci);
                int issued = __ceph_caps_issued(ci, NULL);
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                        int *checkeof)
+                        int *checkeof, bool align_to_pages,
+                        unsigned long buf_align)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 pos, this_len;
+        int io_align, page_align;
        int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
        int left, pages_left;
        int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
        page_pos = pages;
        pages_left = num_pages;
        read = 0;
+        io_align = off & ~PAGE_MASK;
 more:
+        if (align_to_pages)
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+        else
+                page_align = pos & ~PAGE_MASK;
        this_len = left;
        ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                  &ci->i_layout, pos, &this_len,
                                  ci->i_truncate_seq,
                                  ci->i_truncate_size,
-                                  page_pos, pages_left);
+                                  page_pos, pages_left, page_align);
        hit_stripe = this_len < left;
        was_short = ret >= 0 && ret < this_len;
        if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
        struct inode *inode = file->f_dentry->d_inode;
        struct page **pages;
        u64 off = *poff;
-        int num_pages = calc_pages_for(off, len);
+        int num_pages, ret;
-        int ret;
        dout("sync_read on file %p %llu~%u %s\n", file, off, len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
        if (file->f_flags & O_DIRECT) {
-                pages = ceph_get_direct_page_vector(data, num_pages, off, len);
+                num_pages = calc_pages_for((unsigned long)data, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, true);
-                /*
-                 * flush any page cache pages in this range.  this
-                 * will make concurrent normal and O_DIRECT io slow,
-                 * but it will at least behave sensibly when they are
-                 * in sequence.
-                 */
        } else {
+                num_pages = calc_pages_for(off, len);
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);
+        /*
+         * flush any page cache pages in this range.  this
+         * will make concurrent normal and sync io slow,
+         * but it will at least behave sensibly when they are
+         * in sequence.
+         */
        ret = filemap_write_and_wait(inode->i_mapping);
        if (ret < 0)
                goto done;
-        ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+        ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+                           file->f_flags & O_DIRECT,
+                           (unsigned long)data & ~PAGE_MASK);
        if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
                ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 done:
        if (file->f_flags & O_DIRECT)
-                ceph_put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, true);
        else
                ceph_release_page_vector(pages, num_pages);
        dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        int flags;
        int do_sync = 0;
        int check_caps = 0;
+        int page_align, io_align;
+        unsigned long buf_align;
        int ret;
        struct timespec mtime = CURRENT_TIME;
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        else
                pos = *offset;
+        io_align = pos & ~PAGE_MASK;
+        buf_align = (unsigned long)data & ~PAGE_MASK;
        ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
        if (ret < 0)
                return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
         */
 more:
        len = left;
+        if (file->f_flags & O_DIRECT) {
+                /* write from beginning of first page, regardless of
+                   io alignment */
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+                num_pages = calc_pages_for((unsigned long)data, len);
+        } else {
+                page_align = pos & ~PAGE_MASK;
+                num_pages = calc_pages_for(pos, len);
+        }
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), pos, &len,
                                    CEPH_OSD_OP_WRITE, flags,
                                    ci->i_snap_realm->cached_context,
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
-                                    &mtime, false, 2);
+                                    &mtime, false, 2, page_align);
        if (!req)
                return -ENOMEM;
-        num_pages = calc_pages_for(pos, len);
        if (file->f_flags & O_DIRECT) {
-                pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, false);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -549,7 +572,7 @@ more:
        }
        if (file->f_flags & O_DIRECT)
-                ceph_put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, false);
        else if (file->f_flags & O_SYNC)
                ceph_release_page_vector(pages, num_pages);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04..5625463aa47 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
@@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_release_count = 0;
        ci->i_symlink = NULL;
+        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
        ci->i_fragtree = RB_ROOT;
        mutex_init(&ci->i_fragtree_mutex);
@@ -369,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        return &ci->vfs_inode;
 }
+/*
+ * RCU callback that returns a ceph inode to ceph_inode_cachep.
+ *
+ * @head is the i_rcu member embedded in the VFS inode; the containing
+ * struct inode and its ceph_inode_info are recovered from it before the
+ * object is freed.  Queued from ceph_destroy_inode() via call_rcu().
+ */
+static void ceph_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        /*
+         * NOTE(review): presumably re-initialises i_dentry so the object
+         * goes back to the slab cache in its constructed state -- confirm.
+         */
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ceph_inode_cachep, ci);
+}
 void ceph_destroy_inode(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -408,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
        if (ci->i_xattrs.prealloc_blob)
                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
-        kmem_cache_free(ceph_inode_cachep, ci);
+        call_rcu(&inode->i_rcu, ceph_i_callback);
 }
@@ -471,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
        if (issued & (CEPH_CAP_FILE_EXCL|
                      CEPH_CAP_FILE_WR|
-                      CEPH_CAP_FILE_BUFFER)) {
+                      CEPH_CAP_FILE_BUFFER|
+                      CEPH_CAP_AUTH_EXCL|
+                      CEPH_CAP_XATTR_EXCL)) {
                if (timespec_compare(ctime, &inode->i_ctime) > 0) {
                        dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
                             inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
                        warn = 1;
                }
        } else {
-                /* we have no write caps; whatever the MDS says is true */
+                /* we have no write|excl caps; whatever the MDS says is true */
                if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
                        inode->i_ctime = *ctime;
                        inode->i_mtime = *mtime;
@@ -567,12 +579,17 @@ static int fill_inode(struct inode *inode,
        /*
         * provided version will be odd if inode value is projected,
-         * even if stable.  skip the update if we have a newer info
+         * even if stable.  skip the update if we have newer stable
-         * (e.g., due to inode info racing form multiple MDSs), or if
+         * info (ours>=theirs, e.g. due to racing mds replies), unless
-         * we are getting projected (unstable) inode info.
+         * we are getting projected (unstable) info (in which case the
+         * version is odd, and we want ours>theirs).
+         *   us   them
+         *   2    2     skip
+         *   3    2     skip
+         *   3    3     update
         */
        if (le64_to_cpu(info->version) > 0 &&
-            (ci->i_version & ~1) > le64_to_cpu(info->version))
+            (ci->i_version & ~1) >= le64_to_cpu(info->version))
                goto no_change;
        issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +623,14 @@ static int fill_inode(struct inode *inode,
                            le32_to_cpu(info->time_warp_seq),
                            &ctime, &mtime, &atime);
-        ci->i_max_size = le64_to_cpu(info->max_size);
+        /* only update max_size on auth cap */
+        if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+            ci->i_max_size != le64_to_cpu(info->max_size)) {
+                dout("max_size %lld -> %llu\n", ci->i_max_size,
+                     le64_to_cpu(info->max_size));
+                ci->i_max_size = le64_to_cpu(info->max_size);
+        }
        ci->i_layout = info->layout;
        inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -667,6 +691,8 @@ static int fill_inode(struct inode *inode,
                inode->i_op = &ceph_dir_iops;
                inode->i_fop = &ceph_dir_fops;
+                ci->i_dir_layout = iinfo->dir_layout;
                ci->i_files = le64_to_cpu(info->files);
                ci->i_subdirs = le64_to_cpu(info->subdirs);
                ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -684,10 +710,6 @@ static int fill_inode(struct inode *inode,
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
                        ci->i_max_offset = 2;
                }
-                /* it may be better to set st_size in getattr instead? */
-                if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
-                        inode->i_size = ci->i_rbytes;
                break;
        default:
                pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -828,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
        di->offset = ceph_inode(inode)->i_max_offset++;
        spin_unlock(&inode->i_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&dir->d_lock);
-        spin_lock(&dn->d_lock);
+        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
        list_move(&dn->d_u.d_child, &dir->d_subdirs);
        dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
             dn->d_u.d_child.prev, dn->d_u.d_child.next);
        spin_unlock(&dn->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dir->d_lock);
 }
 /*
@@ -866,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        } else if (realdn) {
                dout("dn %p (%d) spliced with %p (%d) "
                     "inode %p ino %llx.%llx\n",
-                     dn, atomic_read(&dn->d_count),
+                     dn, dn->d_count,
-                     realdn, atomic_read(&realdn->d_count),
+                     realdn, realdn->d_count,
                     realdn->d_inode, ceph_vinop(realdn->d_inode));
                dput(dn);
                dn = realdn;
@@ -1055,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                ininfo = rinfo->targeti.in;
                vino.ino = le64_to_cpu(ininfo->ino);
                vino.snap = le64_to_cpu(ininfo->snapid);
-                if (!dn->d_inode) {
+                in = dn->d_inode;
+                if (!in) {
                        in = ceph_get_inode(sb, vino);
                        if (IS_ERR(in)) {
                                pr_err("fill_trace bad get_inode "
@@ -1217,11 +1240,11 @@ retry_lookup:
                        goto retry_lookup;
                } else {
                        /* reorder parent's d_subdirs */
-                        spin_lock(&dcache_lock);
+                        spin_lock(&parent->d_lock);
-                        spin_lock(&dn->d_lock);
+                        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
                        list_move(&dn->d_u.d_child, &parent->d_subdirs);
                        spin_unlock(&dn->d_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&parent->d_lock);
                }
                di = dn->d_fsdata;
@@ -1386,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
        spin_lock(&inode->i_lock);
        dout("invalidate_pages %p gen %d revoking %d\n", inode,
             ci->i_rdcache_gen, ci->i_rdcache_revoking);
-        if (ci->i_rdcache_gen == 0 ||
+        if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-            ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-                BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
                /* nevermind! */
-                ci->i_rdcache_revoking = 0;
                spin_unlock(&inode->i_lock);
                goto out;
        }
@@ -1400,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
        ceph_invalidate_nondirty_pages(inode->i_mapping);
        spin_lock(&inode->i_lock);
-        if (orig_gen == ci->i_rdcache_gen) {
+        if (orig_gen == ci->i_rdcache_gen &&
+            orig_gen == ci->i_rdcache_revoking) {
                dout("invalidate_pages %p gen %d successful\n", inode,
                     ci->i_rdcache_gen);
-                ci->i_rdcache_gen = 0;
+                ci->i_rdcache_revoking--;
-                ci->i_rdcache_revoking = 0;
                check = 1;
        } else {
-                dout("invalidate_pages %p gen %d raced, gen now %d\n",
+                dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
-                     inode, orig_gen, ci->i_rdcache_gen);
+                     inode, orig_gen, ci->i_rdcache_gen,
+                     ci->i_rdcache_revoking);
        }
        spin_unlock(&inode->i_lock);
@@ -1739,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
                return 0;
        }
-        dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+        dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
        if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
                return 0;
@@ -1760,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
 * Check inode permissions.  We verify we have a valid value for
 * the AUTH cap, then call the generic handler.
 */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+        int err;
+        /*
+         * Refuse the check with -ECHILD when IPERM_FLAG_RCU is set, before
+         * calling ceph_do_getattr().  NOTE(review): presumably because
+         * ceph_do_getattr() may block on an MDS round trip -- confirm.
+         */
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
         if (!err)
-                err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, flags, NULL);
         return err;
 }
@@ -1789,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
                else
                        stat->dev = 0;
                if (S_ISDIR(inode->i_mode)) {
-                        stat->size = ci->i_rbytes;
+                        if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+                                                RBYTES))
+                                stat->size = ci->i_rbytes;
+                        else
+                                stat->size = ci->i_files + ci->i_subdirs;
                        stat->blocks = 0;
                        stat->blksize = 65536;
                }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb..52e8fd74d45 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
-#define CEPH_IOCTL_MAGIC 0x98
+#define CEPH_IOCTL_MAGIC 0x97
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c34..476b329867d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
 * Implement fcntl and flock locking functions.
 */
 static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
-                             u64 pid, u64 pid_ns,
+                             int cmd, u8 wait, struct file_lock *fl)
-                             int cmd, u64 start, u64 length, u8 wait)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_mds_client *mdsc =
                ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_mds_request *req;
        int err;
+        u64 length = 0;
        req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
        req->r_inode = igrab(inode);
+        /* mds requires start and length rather than start and end */
+        if (LLONG_MAX == fl->fl_end)
+                length = 0;
+        else
+                length = fl->fl_end - fl->fl_start + 1;
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
             "length: %llu, wait: %d, type`: %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type);
        req->r_args.filelock_change.rule = lock_type;
        req->r_args.filelock_change.type = cmd;
-        req->r_args.filelock_change.pid = cpu_to_le64(pid);
+        req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
        /* This should be adjusted, but I'm not sure if
           namespaces actually get id numbers*/
        req->r_args.filelock_change.pid_namespace =
-                cpu_to_le64((u64)pid_ns);
+                cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
-        req->r_args.filelock_change.start = cpu_to_le64(start);
+        req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
        req->r_args.filelock_change.length = cpu_to_le64(length);
        req->r_args.filelock_change.wait = wait;
        err = ceph_mdsc_do_request(mdsc, inode, req);
+        if ( operation == CEPH_MDS_OP_GETFILELOCK){
+                fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+                if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_RDLCK;
+                else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_WRLCK;
+                else
+                        fl->fl_type = F_UNLCK;
+                fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+                length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+                                                 le64_to_cpu(req->r_reply_info.filelock_reply->length);
+                if (length >= 1)
+                        fl->fl_end = length -1;
+                else
+                        fl->fl_end = 0;
+        }
        ceph_mdsc_put_request(req);
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-             "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
+             "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd, err);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type, err);
        return err;
 }
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 */
 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        if (LLONG_MAX == fl->fl_end)
+        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
-        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                (u64)fl->fl_pid,
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
-                dout("mds locked, locking locally");
+                if ( op != CEPH_MDS_OP_GETFILELOCK ){
-                err = posix_lock_file(file, fl, NULL);
+                        dout("mds locked, locking locally");
-                if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+                        err = posix_lock_file(file, fl, NULL);
-                        /* undo! This should only happen if the kernel detects
+                        if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
-                         * local deadlock. */
+                                /* undo! This should only happen if the kernel detects
-                        ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                 * local deadlock. */
-                                          (u64)fl->fl_pid,
+                                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                          (u64)(unsigned long)fl->fl_nspid,
+                                                  CEPH_LOCK_UNLOCK, 0, fl);
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
+                                dout("got %d on posix_lock_file, undid lock", err);
-                                          length, 0);
+                        }
-                        dout("got %d on posix_lock_file, undid lock", err);
                }
        } else {
                dout("mds returned error code %d", err);
        }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                lock_cmd = CEPH_LOCK_EXCL;
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        /* mds requires start and length rather than start and end */
-        if (LLONG_MAX == fl->fl_end)
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-                                file, (u64)fl->fl_pid,
+                                file, lock_cmd, wait, fl);
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
                err = flock_lock_file_wait(file, fl);
                if (err) {
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
-                                          file, (u64)fl->fl_pid,
+                                          file, CEPH_LOCK_UNLOCK, 0, fl);
-                                          (u64)(unsigned long)fl->fl_nspid,
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
-                                          length, 0);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
                }
        } else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c..a1ee8fa3a8e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
 #include <linux/sched.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 #include "super.h"
 #include "mds_client.h"
@@ -61,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
 * parse individual inode info
 */
 static int parse_reply_info_in(void **p, void *end,
-                               struct ceph_mds_reply_info_in *info)
+                               struct ceph_mds_reply_info_in *info,
+                               int features)
 {
        int err = -EIO;
@@ -75,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
        info->symlink = *p;
        *p += info->symlink_len;
+        if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+                ceph_decode_copy_safe(p, end, &info->dir_layout,
+                                      sizeof(info->dir_layout), bad);
+        else
+                memset(&info->dir_layout, 0, sizeof(info->dir_layout));
        ceph_decode_32_safe(p, end, info->xattr_len, bad);
        ceph_decode_need(p, end, info->xattr_len, bad);
        info->xattr_data = *p;
@@ -89,12 +95,13 @@ bad:
 * target inode.
 */
 static int parse_reply_info_trace(void **p, void *end,
-                                  struct ceph_mds_reply_info_parsed *info)
+                                  struct ceph_mds_reply_info_parsed *info,
+                                  int features)
 {
        int err;
        if (info->head->is_dentry) {
-                err = parse_reply_info_in(p, end, &info->diri);
+                err = parse_reply_info_in(p, end, &info->diri, features);
                if (err < 0)
                        goto out_bad;
@@ -115,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
        }
        if (info->head->is_target) {
-                err = parse_reply_info_in(p, end, &info->targeti);
+                err = parse_reply_info_in(p, end, &info->targeti, features);
                if (err < 0)
                        goto out_bad;
        }
@@ -135,7 +142,8 @@ out_bad:
 * parse readdir results
 */
 static int parse_reply_info_dir(void **p, void *end,
-                                struct ceph_mds_reply_info_parsed *info)
+                                struct ceph_mds_reply_info_parsed *info,
+                                int features)
 {
        u32 num, i = 0;
        int err;
@@ -183,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
                *p += sizeof(struct ceph_mds_reply_lease);
                /* inode */
-                err = parse_reply_info_in(p, end, &info->dir_in[i]);
+                err = parse_reply_info_in(p, end, &info->dir_in[i], features);
                if (err < 0)
                        goto out_bad;
                i++;
@@ -203,10 +211,45 @@ out_bad:
 }
 /*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+                                     struct ceph_mds_reply_info_parsed *info,
+                                     int features)
+{
+        if (*p + sizeof(*info->filelock_reply) > end)
+                goto bad;
+        info->filelock_reply = *p;
+        *p += sizeof(*info->filelock_reply);
+        if (unlikely(*p != end))
+                goto bad;
+        return 0;
+bad:
+        return -EIO;
+}
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+                                  struct ceph_mds_reply_info_parsed *info,
+                                  int features)
+{
+        if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+                return parse_reply_info_filelock(p, end, info, features);
+        else
+                return parse_reply_info_dir(p, end, info, features);
+}
+/*
 * parse entire mds reply
 */
 static int parse_reply_info(struct ceph_msg *msg,
-                            struct ceph_mds_reply_info_parsed *info)
+                            struct ceph_mds_reply_info_parsed *info,
+                            int features)
 {
        void *p, *end;
        u32 len;
@@ -219,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
        /* trace */
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
-                err = parse_reply_info_trace(&p, p+len, info);
+                err = parse_reply_info_trace(&p, p+len, info, features);
                if (err < 0)
                        goto out_bad;
        }
-        /* dir content */
+        /* extra */
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
-                err = parse_reply_info_dir(&p, p+len, info);
+                err = parse_reply_info_extra(&p, p+len, info, features);
                if (err < 0)
                        goto out_bad;
        }
@@ -529,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
        ceph_mdsc_get_request(req);
        __insert_request(mdsc, req);
+        req->r_uid = current_fsuid();
+        req->r_gid = current_fsgid();
        if (dir) {
                struct ceph_inode_info *ci = ceph_inode(dir);
@@ -620,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                } else {
                        /* dir + name */
                        inode = dir;
-                        hash = req->r_dentry->d_name.hash;
+                        hash = ceph_dentry_hash(req->r_dentry);
                        is_hash = true;
                }
        }
@@ -647,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                dout("choose_mds %p %llx.%llx "
                                     "frag %u mds%d (%d/%d)\n",
                                     inode, ceph_vinop(inode),
-                                     frag.frag, frag.mds,
+                                     frag.frag, mds,
                                     (int)r, frag.ndist);
-                                return mds;
+                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+                                    CEPH_MDS_STATE_ACTIVE)
+                                        return mds;
                        }
                        /* since this file/dir wasn't known to be
@@ -662,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                dout("choose_mds %p %llx.%llx "
                                     "frag %u mds%d (auth)\n",
                                     inode, ceph_vinop(inode), frag.frag, mds);
-                                return mds;
+                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+                                    CEPH_MDS_STATE_ACTIVE)
+                                        return mds;
                        }
                }
        }
@@ -1452,7 +1502,7 @@ retry:
        *base = ceph_ino(temp->d_inode);
        *plen = len;
        dout("build_path on %p %d built %llx '%.*s'\n",
-             dentry, atomic_read(&dentry->d_count), *base, len, path);
+             dentry, dentry->d_count, *base, len, path);
        return path;
 }
@@ -1588,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
        head->op = cpu_to_le32(req->r_op);
-        head->caller_uid = cpu_to_le32(current_fsuid());
+        head->caller_uid = cpu_to_le32(req->r_uid);
-        head->caller_gid = cpu_to_le32(current_fsgid());
+        head->caller_gid = cpu_to_le32(req->r_gid);
        head->args = req->r_args;
        ceph_encode_filepath(&p, end, ino1, path1);
@@ -1659,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
        int flags = 0;
-        req->r_mds = mds;
        req->r_attempts++;
        if (req->r_inode) {
                struct ceph_cap *cap =
@@ -1746,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
                goto finish;
        }
+        put_request_session(req);
        mds = __choose_mds(mdsc, req);
        if (mds < 0 ||
            ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1763,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
                        goto finish;
                }
        }
+        req->r_session = get_session(session);
        dout("do_request mds%d session %p state %s\n", mds, session,
             session_state_name(session->s_state));
        if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1775,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
        }
        /* send request */
-        req->r_session = get_session(session);
        req->r_resend_mds = -1;   /* forget any previous mds hint */
        if (req->r_request_started == 0)   /* note request start time */
@@ -1829,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
                if (req->r_session &&
                    req->r_session->s_mds == mds) {
                        dout(" kicking tid %llu\n", req->r_tid);
-                        put_request_session(req);
                        __do_request(mdsc, req);
                }
        }
@@ -2022,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out;
                } else  {
                        struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-                        struct ceph_cap *cap =
+                        struct ceph_cap *cap = NULL;
-                                ceph_get_cap_for_mds(ci, req->r_mds);;
+                        if (req->r_session)
+                                cap = ceph_get_cap_for_mds(ci,
+                                                   req->r_session->s_mds);
                        dout("already using auth");
                        if ((!cap || cap != ci->i_auth_cap) ||
@@ -2067,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        dout("handle_reply tid %lld result %d\n", tid, result);
        rinfo = &req->r_reply_info;
-        err = parse_reply_info(msg, rinfo);
+        err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
        mutex_unlock(&mdsc->mutex);
        mutex_lock(&session->s_mutex);
        if (err < 0) {
-                pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+                pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
                ceph_msg_dump(msg);
                goto out_err;
        }
@@ -2092,7 +2146,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        mutex_lock(&req->r_fill_mutex);
        err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
        if (err == 0) {
-                if (result == 0 && rinfo->dir_nr)
+                if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+                    rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
                ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
        }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c7235..4e3a9cc0bba 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
 */
 struct ceph_mds_reply_info_in {
        struct ceph_mds_reply_inode *in;
+        struct ceph_dir_layout dir_layout;
        u32 symlink_len;
        char *symlink;
        u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
 };
 /*
- * parsed info about an mds reply, including information about the
+ * parsed info about an mds reply, including information about
- * target inode and/or its parent directory and dentry, and directory
+ * either: 1) the target inode and/or its parent directory and dentry,
- * contents (for readdir results).
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
 */
 struct ceph_mds_reply_info_parsed {
        struct ceph_mds_reply_head    *head;
+        /* trace */
        struct ceph_mds_reply_info_in diri, targeti;
        struct ceph_mds_reply_dirfrag *dirfrag;
        char                          *dname;
        u32                           dname_len;
        struct ceph_mds_reply_lease   *dlease;
-        struct ceph_mds_reply_dirfrag *dir_dir;
+        /* extra */
-        int                           dir_nr;
+        union {
-        char                          **dir_dname;
+                /* for fcntl F_GETLK results */
-        u32                           *dir_dname_len;
+                struct ceph_filelock *filelock_reply;
-        struct ceph_mds_reply_lease   **dir_dlease;
-        struct ceph_mds_reply_info_in *dir_in;
+                /* for readdir results */
-        u8                            dir_complete, dir_end;
+                struct {
+                        struct ceph_mds_reply_dirfrag *dir_dir;
+                        int                           dir_nr;
+                        char                          **dir_dname;
+                        u32                           *dir_dname_len;
+                        struct ceph_mds_reply_lease   **dir_dlease;
+                        struct ceph_mds_reply_info_in *dir_in;
+                        u8                            dir_complete, dir_end;
+                };
+        };
        /* encoded blob describing snapshot contexts for certain
           operations (e.g., open) */
@@ -154,7 +166,6 @@ struct ceph_mds_request {
        struct ceph_mds_client *r_mdsc;
        int r_op;                    /* mds op code */
-        int r_mds;
        /* operation on what? */
        struct inode *r_inode;              /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
        union ceph_mds_request_args r_args;
        int r_fmode;        /* file mode, if expecting cap */
+        uid_t r_uid;
+        gid_t r_gid;
        /* for choosing which mds to send this request to */
        int r_direct_mode;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae053..9c5085465a6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+        fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
@@ -428,7 +430,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail;
        }
        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
-        fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+        fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
+                CEPH_FEATURE_DIRLAYOUTHASH;
        fsc->client->monc.want_mdsmap = 1;
        fsc->mount_options = fsopt;
@@ -443,13 +446,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail_client;
        err = -ENOMEM;
-        fsc->wb_wq = create_workqueue("ceph-writeback");
+        /*
+         * The number of concurrent works can be high but they don't need
+         * to be processed in parallel, limit concurrency.
+         */
+        fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
        if (fsc->wb_wq == NULL)
                goto fail_bdi;
-        fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+        fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
        if (fsc->pg_inv_wq == NULL)
                goto fail_wb_wq;
-        fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+        fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
        if (fsc->trunc_wq == NULL)
                goto fail_pg_inv_wq;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f..20b907d76ae 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
        unsigned i_ceph_flags;
        unsigned long i_release_count;
+        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
        char *i_symlink;
@@ -293,9 +294,7 @@ struct ceph_inode_info {
        int i_rd_ref, i_rdcache_ref, i_wr_ref;
        int i_wrbuffer_ref, i_wrbuffer_ref_head;
        u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
-        u32 i_rdcache_gen;      /* we increment this each time we get
+        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
-                                   FILE_CACHE.  If it's non-zero, we
-                                   _may_ have cached pages. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
        struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -667,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
 extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
                        struct kstat *stat);
@@ -770,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct dentry *dn);
 /*
 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f7..8c9eba6ef9d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct ceph_inode_xattr *xattr = NULL;
+        int name_len = strlen(name);
        int c;
        p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
                parent = *p;
                xattr = rb_entry(parent, struct ceph_inode_xattr, node);
                c = strncmp(name, xattr->name, xattr->name_len);
+                if (c == 0 && name_len > xattr->name_len)
+                        c = 1;
                if (c < 0)
                        p = &(*p)->rb_left;
                else if (c > 0)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b9..dca9e5e0f73 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
 } *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
 /* index in the above */
-static inline int major_to_index(int major)
+static inline int major_to_index(unsigned major)
 {
        return major % CHRDEV_MAJOR_HASH_SIZE;
 }
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
        return ret;
 }
-int cdev_index(struct inode *inode)
-{
-        int idx;
-        struct kobject *kobj;
-        kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
-        if (!kobj)
-                return -1;
-        kobject_put(kobj);
-        return idx;
-}
 void cd_forget(struct inode *inode)
 {
        spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
-EXPORT_SYMBOL(cdev_index);
 EXPORT_SYMBOL(__register_chrdev);
 EXPORT_SYMBOL(__unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ce..ee45648b0d1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
        select NLS
        select CRYPTO
        select CRYPTO_MD5
+        select CRYPTO_HMAC
        select CRYPTO_ARC4
        help
          This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
            to be cached locally on disk through the general filesystem cache
            manager. If unsure, say N.
+config CIFS_ACL
+          bool "Provide CIFS ACL support (EXPERIMENTAL)"
+          depends on EXPERIMENTAL && CIFS_XATTR
+          help
+            Allows fetching CIFS/NTFS ACLs from the server.  The DACL blob
+            is handed over to the application/caller.
 config CIFS_EXPERIMENTAL
          bool "CIFS Experimental Features (EXPERIMENTAL)"
          depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bd..d87558448e3 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,8 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
          link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-          md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+          cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-          readdir.o ioctl.o sess.o export.o cifsacl.o
+          readdir.o ioctl.o sess.o export.o
+cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d103654..fe168359082 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
  wsize         default write size (default 57344)
                maximum wsize currently allowed by CIFS is 57344 (fourteen
                4096 byte pages)
+  actimeo=n     attribute cache timeout in seconds (default 1 second).
+                After this timeout, the cifs client requests fresh attribute
+                information from the server. This option allows tuning the
+                attribute cache timeout to suit the workload needs. Shorter
+                timeouts mean better cache coherency, but an increased number
+                of calls to the server. Longer timeouts mean a reduced number
+                of calls to the server at the expense of less strict cache
+                coherency checks (i.e. incorrect attribute cache for a short
+                period of time).
  rw            mount the network share read-write (note that the
                server may still consider the share read-only)
  ro            mount network share read-only
@@ -443,6 +452,11 @@ A partial list of the supported mount options follows:
                if oplock (caching token) is granted and held. Note that
                direct allows write operations larger than page size
                to be sent to the server.
+  strictcache   Use for switching on strict cache mode. In this mode the
+                client reads from the cache as long as it has Oplock Level II;
+                otherwise it reads from the server. All written data are stored
+                in the cache, but if the client doesn't have an Exclusive
+                Oplock, it writes the data to the server.
  acl           Allow setfacl and getfacl to manage posix ACLs if server
                supports them.  (default)
  noacl         Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e5..355abcdcda9 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
 v) mount check for unmatched uids
-w) Add support for new vfs entry points for setlease and fallocate 
+w) Add support for new vfs entry point for fallocate
 x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
 processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fc..e654dfd092c 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
                                   void *buffer, uint16_t maxbuf)
 {
        const struct TCP_Server_Info *server = cookie_netfs_data;
-        const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+        const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+        const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+        const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
        struct cifs_server_key *key = buffer;
        uint16_t key_len = sizeof(struct cifs_server_key);
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
         */
        switch (sa->sa_family) {
        case AF_INET:
-                key->family = server->addr.sockAddr.sin_family;
+                key->family = sa->sa_family;
-                key->port = server->addr.sockAddr.sin_port;
+                key->port = addr->sin_port;
-                key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+                key->addr[0].ipv4_addr = addr->sin_addr;
                key_len += sizeof(key->addr[0].ipv4_addr);
                break;
        case AF_INET6:
-                key->family = server->addr.sockAddr6.sin6_family;
+                key->family = sa->sa_family;
-                key->port = server->addr.sockAddr6.sin6_port;
+                key->port = addr6->sin6_port;
-                key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+                key->addr[0].ipv6_addr = addr6->sin6_addr;
                key_len += sizeof(key->addr[0].ipv6_addr);
                break;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b..65829d32128 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
        spin_lock(&GlobalMid_Lock);
        list_for_each(tmp, &server->pending_mid_q) {
                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-                cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+                cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
                        mid_entry->midState,
                        (int)mid_entry->command,
                        mid_entry->pid,
-                        mid_entry->tsk,
+                        mid_entry->callback_data,
                        mid_entry->mid);
 #ifdef CONFIG_CIFS_STATS2
                cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                    "Display Internal CIFS Data Structures for Debugging\n"
                    "---------------------------------------------------\n");
        seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
-        seq_printf(m, "Features: ");
+        seq_printf(m, "Features:");
 #ifdef CONFIG_CIFS_DFS_UPCALL
-        seq_printf(m, "dfs");
+        seq_printf(m, " dfs");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_FSCACHE
-        seq_printf(m, "fscache");
+        seq_printf(m, " fscache");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-        seq_printf(m, "lanman");
+        seq_printf(m, " lanman");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_POSIX
-        seq_printf(m, "posix");
+        seq_printf(m, " posix");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_UPCALL
-        seq_printf(m, "spnego");
+        seq_printf(m, " spnego");
-        seq_putc(m, ' ');
 #endif
 #ifdef CONFIG_CIFS_XATTR
-        seq_printf(m, "xattr");
+        seq_printf(m, " xattr");
+#endif
+#ifdef CONFIG_CIFS_ACL
+        seq_printf(m, " acl");
 #endif
        seq_putc(m, '\n');
        seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
@@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                                mid_entry = list_entry(tmp3, struct mid_q_entry,
                                        qhead);
                                seq_printf(m, "\tState: %d com: %d pid:"
-                                                " %d tsk: %p mid %d\n",
+                                                " %d cbdata: %p mid %d\n",
                                                mid_entry->midState,
                                                (int)mid_entry->command,
                                                mid_entry->pid,
-                                                mid_entry->tsk,
+                                                mid_entry->callback_data,
                                                mid_entry->mid);
                        }
                        spin_unlock(&GlobalMid_Lock);
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                                atomic_read(&totSmBufAllocCount));
 #endif /* CONFIG_CIFS_STATS2 */
-        seq_printf(m, "Operations (MIDs): %d\n", midCount.counter);
+        seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
        seq_printf(m,
                "\n%d session %d share reconnects\n",
                tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27f..f1c68629f27 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 }
-static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
-                                struct list_head *mntlist)
-{
-        /* stolen from afs code */
-        int err;
-        mntget(newmnt);
-        err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
-        switch (err) {
-        case 0:
-                path_put(&nd->path);
-                nd->path.mnt = newmnt;
-                nd->path.dentry = dget(newmnt->mnt_root);
-                schedule_delayed_work(&cifs_dfs_automount_task,
-                                      cifs_dfs_mountpoint_expiry_timeout);
-                break;
-        case -EBUSY:
-                /* someone else made a mount here whilst we were busy */
-                while (d_mountpoint(nd->path.dentry) &&
-                       follow_down(&nd->path))
-                        ;
-                err = 0;
-        default:
-                mntput(newmnt);
-                break;
-        }
-        return err;
-}
 static void dump_referral(const struct dfs_info3_param *ref)
 {
        cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,45 +264,42 @@ static void dump_referral(const struct dfs_info3_param *ref)
                                ref->path_consumed);
 }
+/*
-static void*
+ * Create a vfsmount that we can automount
-cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+ */
+static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 {
        struct dfs_info3_param *referrals = NULL;
        unsigned int num_referrals = 0;
        struct cifs_sb_info *cifs_sb;
        struct cifsSesInfo *ses;
-        char *full_path = NULL;
+        char *full_path;
        int xid, i;
-        int rc = 0;
+        int rc;
-        struct vfsmount *mnt = ERR_PTR(-ENOENT);
+        struct vfsmount *mnt;
        struct tcon_link *tlink;
        cFYI(1, "in %s", __func__);
-        BUG_ON(IS_ROOT(dentry));
+        BUG_ON(IS_ROOT(mntpt));
        xid = GetXid();
-        dput(nd->path.dentry);
-        nd->path.dentry = dget(dentry);
        /*
         * The MSDFS spec states that paths in DFS referral requests and
         * responses must be prefixed by a single '\' character instead of
         * the double backslashes usually used in the UNC. This function
         * gives us the latter, so we must adjust the result.
         */
-        full_path = build_path_from_dentry(dentry);
+        mnt = ERR_PTR(-ENOMEM);
-        if (full_path == NULL) {
+        full_path = build_path_from_dentry(mntpt);
-                rc = -ENOMEM;
+        if (full_path == NULL)
-                goto out_err;
+                goto free_xid;
-        }
-        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+        cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink)) {
-                rc = PTR_ERR(tlink);
+                mnt = ERR_CAST(tlink);
-                goto out_err;
+                goto free_full_path;
        }
        ses = tlink_tcon(tlink)->ses;
@@ -341,46 +309,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
        cifs_put_tlink(tlink);
+        mnt = ERR_PTR(-ENOENT);
        for (i = 0; i < num_referrals; i++) {
                int len;
-                dump_referral(referrals+i);
+                dump_referral(referrals + i);
                /* connect to a node */
                len = strlen(referrals[i].node_name);
                if (len < 2) {
                        cERROR(1, "%s: Net Address path too short: %s",
                                        __func__, referrals[i].node_name);
-                        rc = -EINVAL;
+                        mnt = ERR_PTR(-EINVAL);
-                        goto out_err;
+                        break;
                }
                mnt = cifs_dfs_do_refmount(cifs_sb,
                                full_path, referrals + i);
                cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
                                        referrals[i].node_name, mnt);
-                /* complete mount procedure if we accured submount */
                if (!IS_ERR(mnt))
-                        break;
+                        goto success;
        }
-        /* we need it cause for() above could exit without valid submount */
+        /* no valid submounts were found; return error from get_dfs_path() by
-        rc = PTR_ERR(mnt);
+         * preference */
-        if (IS_ERR(mnt))
+        if (rc != 0)
-                goto out_err;
+                mnt = ERR_PTR(rc);
-        rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
-out:
+success:
-        FreeXid(xid);
        free_dfs_info_array(referrals, num_referrals);
+free_full_path:
        kfree(full_path);
+free_xid:
+        FreeXid(xid);
        cFYI(1, "leaving %s" , __func__);
-        return ERR_PTR(rc);
+        return mnt;
-out_err:
+}
-        path_put(&nd->path);
-        goto out;
+/*
+ * Attempt to automount the referral
+ *
+ * Builds the submount for path->dentry with cifs_dfs_do_automount(). If that
+ * yields an ERR_PTR it is returned to the caller unchanged. Otherwise an
+ * extra reference is taken on the new mount, the mount is put on
+ * cifs_dfs_automount_list with mnt_set_expiry(), cifs_dfs_automount_task is
+ * scheduled to run after cifs_dfs_mountpoint_expiry_timeout, and the new
+ * vfsmount is returned.
+ */
+struct vfsmount *cifs_dfs_d_automount(struct path *path)
+{
+        struct vfsmount *newmnt;
+        cFYI(1, "in %s", __func__);
+        newmnt = cifs_dfs_do_automount(path->dentry);
+        if (IS_ERR(newmnt)) {
+                /* nothing was mounted, so there is nothing to put on the expiry list */
+                cFYI(1, "leaving %s [automount failed]" , __func__);
+                return newmnt;
+        }
+        mntget(newmnt); /* prevent immediate expiration */
+        mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
+        /* run the automount task once the expiry timeout has elapsed */
+        schedule_delayed_work(&cifs_dfs_automount_task,
+                              cifs_dfs_mountpoint_expiry_timeout);
+        cFYI(1, "leaving %s [ok]" , __func__);
+        return newmnt;
 }
 const struct inode_operations cifs_dfs_referral_inode_operations = {
-        .follow_link = cifs_dfs_follow_mountpoint,
 };
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a410..ac51cd2d33a 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
 *   the GNU Lesser General Public License for more details.
 *
 */
-#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
@@ -40,14 +40,16 @@
 #define CIFS_MOUNT_FSCACHE      0x8000 /* local caching enabled */
 #define CIFS_MOUNT_MF_SYMLINKS  0x10000 /* Minshall+French Symlinks enabled */
 #define CIFS_MOUNT_MULTIUSER    0x20000 /* multiuser mount */
+#define CIFS_MOUNT_STRICT_IO    0x40000 /* strict cache mode */
 struct cifs_sb_info {
-        struct radix_tree_root tlink_tree;
+        struct rb_root tlink_tree;
-#define CIFS_TLINK_MASTER_TAG           0       /* is "master" (mount) tcon */
        spinlock_t tlink_tree_lock;
+        struct tcon_link *master_tlink;
        struct nls_table *local_nls;
        unsigned int rsize;
        unsigned int wsize;
+        unsigned long actimeo; /* attribute cache timeout (jiffies) */
        atomic_t active;
        uid_t   mnt_uid;
        gid_t   mnt_gid;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1..4dfba828316 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 {
        struct TCP_Server_Info *server = sesInfo->server;
+        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
        char *description, *dp;
        size_t desc_len;
        struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
        dp = description + strlen(description);
        /* add the server address */
-        if (server->addr.sockAddr.sin_family == AF_INET)
+        if (server->dstaddr.ss_family == AF_INET)
-                sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
+                sprintf(dp, "ip4=%pI4", &sa->sin_addr);
-        else if (server->addr.sockAddr.sin_family == AF_INET6)
+        else if (server->dstaddr.ss_family == AF_INET6)
-                sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
+                sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
        else
                goto out;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a172..fc0fd4fde30 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
        int charlen, outlen = 0;
        int maxwords = maxbytes / 2;
        char tmp[NLS_MAX_CHARSET_SIZE];
+        __u16 ftmp;
-        for (i = 0; i < maxwords && from[i]; i++) {
+        for (i = 0; i < maxwords; i++) {
-                charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
+                ftmp = get_unaligned_le16(&from[i]);
-                                             NLS_MAX_CHARSET_SIZE);
+                if (ftmp == 0)
+                        break;
+                charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
                if (charlen > 0)
                        outlen += charlen;
                else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 }
 /*
- * cifs_mapchar - convert a little-endian char to proper char in codepage
+ * cifs_mapchar - convert a host-endian char to proper char in codepage
 * @target - where converted character should be copied
- * @src_char - 2 byte little-endian source character
+ * @src_char - 2 byte host-endian source character
 * @cp - codepage to which character should be converted
 * @mapchar - should character be mapped according to mapchars mount option?
 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
 */
 static int
-cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
             bool mapchar)
 {
        int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
         *     build_path_from_dentry are modified, as they use slash as
         *     separator.
         */
-        switch (le16_to_cpu(src_char)) {
+        switch (src_char) {
        case UNI_COLON:
                *target = ':';
                break;
@@ -109,8 +113,7 @@ out:
        return len;
 cp_convert:
-        len = cp->uni2char(le16_to_cpu(src_char), target,
+        len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
-                           NLS_MAX_CHARSET_SIZE);
        if (len <= 0) {
                *target = '?';
                len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
        int nullsize = nls_nullsize(codepage);
        int fromwords = fromlen / 2;
        char tmp[NLS_MAX_CHARSET_SIZE];
+        __u16 ftmp;
        /*
         * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
         */
        safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
-        for (i = 0; i < fromwords && from[i]; i++) {
+        for (i = 0; i < fromwords; i++) {
+                ftmp = get_unaligned_le16(&from[i]);
+                if (ftmp == 0)
+                        break;
                /*
                 * check to see if converting this character might make the
                 * conversion bleed into the null terminator
                 */
                if (outlen >= safelen) {
-                        charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
+                        charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
                        if ((outlen + charlen) > (tolen - nullsize))
                                break;
                }
                /* put converted char into 'to' buffer */
-                charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
+                charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
                outlen += charlen;
        }
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 {
        int charlen;
        int i;
-        wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */
+        wchar_t wchar_to; /* needed to quiet sparse */
        for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+                charlen = codepage->char2uni(from, len, &wchar_to);
-                /* works for 2.4.0 kernel or later */
-                charlen = codepage->char2uni(from, len, &wchar_to[i]);
                if (charlen < 1) {
-                        cERROR(1, "strtoUCS: char2uni of %d returned %d",
+                        cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
-                                (int)*from, charlen);
+                                *from, charlen);
                        /* A question mark */
-                        to[i] = cpu_to_le16(0x003f);
+                        wchar_to = 0x003f;
                        charlen = 1;
-                } else
+                }
-                        to[i] = cpu_to_le16(wchar_to[i]);
+                put_unaligned_le16(wchar_to, &to[i]);
        }
-        to[i] = 0;
+        put_unaligned_le16(0, &to[i]);
        return i;
 }
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
        return dst;
 }
+/*
+ * cifsConvertToUCS - convert a path name from the current code page to the
+ *                    little-endian 16 bit Unicode used on the wire
+ * @target - destination buffer; every store goes through
+ *           put_unaligned_le16(), so it need not be 2-byte aligned
+ * @source - path name in code page @cp
+ * @maxlen - maximum number of bytes of @source to convert
+ * @cp - code page of @source
+ * @mapChars - when non-zero, remap the six characters ':', '*', '?', '<',
+ *             '>' and '|' (only legal in POSIX-like OS) to their UNI_*
+ *             replacements; when zero the work is handed to cifs_strtoUCS()
+ *
+ * With @mapChars set, returns the number of bytes of @source that were
+ * consumed. @target is NUL terminated only if a NUL byte is found within the
+ * first @maxlen bytes of @source.
+ *
+ * NOTE(review): the !mapChars path returns cifs_strtoUCS()'s result and
+ * bounds the conversion by PATH_MAX rather than @maxlen - confirm callers
+ * expect both behaviours.
+ */
+int
+cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
+                 const struct nls_table *cp, int mapChars)
+{
+        int i, j, charlen;
+        int len_remaining = maxlen;
+        char src_char;
+        wchar_t temp; /* same type cifs_strtoUCS() hands to char2uni() */
+        if (!mapChars)
+                return cifs_strtoUCS(target, source, PATH_MAX, cp);
+        for (i = 0, j = 0; i < maxlen; j++) {
+                src_char = source[i];
+                /* the remapped characters are all one byte in the source */
+                charlen = 1;
+                switch (src_char) {
+                case 0:
+                        put_unaligned_le16(0, &target[j]);
+                        goto ctoUCS_out;
+                case ':':
+                        temp = UNI_COLON;
+                        break;
+                case '*':
+                        temp = UNI_ASTERIK;
+                        break;
+                case '?':
+                        temp = UNI_QUESTION;
+                        break;
+                case '<':
+                        temp = UNI_LESSTHAN;
+                        break;
+                case '>':
+                        temp = UNI_GRTRTHAN;
+                        break;
+                case '|':
+                        temp = UNI_PIPE;
+                        break;
+                /*
+                 * FIXME: We can not handle remapping backslash (UNI_SLASH)
+                 * until all the calls to build_path_from_dentry are modified,
+                 * as they use backslash as separator.
+                 */
+                default:
+                        charlen = cp->char2uni(source + i, len_remaining,
+                                                &temp);
+                        /*
+                         * if no match, use question mark, which at least in
+                         * some cases serves as wild card
+                         */
+                        if (charlen < 1) {
+                                temp = 0x003f;
+                                charlen = 1;
+                        }
+                        break;
+                }
+                /*
+                 * Store the converted character for every case. The default
+                 * case used to "continue" past this store, so ordinary
+                 * characters were never written to the target.
+                 */
+                put_unaligned_le16(temp, &target[j]);
+                /*
+                 * character may take more than one byte in the source
+                 * string, but will take exactly two bytes in the
+                 * target string
+                 */
+                i += charlen;
+                len_remaining -= charlen;
+        }
+ctoUCS_out:
+        return i;
+}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae82..1e7636b145a 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
 #include "cifs_debug.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
        {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
        {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -43,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 ;
-/* security id for everyone */
+/* security id for everyone/world system group */
 static const struct cifs_sid sid_everyone = {
        1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct cifs_sid sid_authusers = {
+        1, 1, {0, 0, 0, 0, 0, 5}, {11} };
 /* group users */
 static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
@@ -367,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
        if (num_aces  > 0) {
                umode_t user_mask = S_IRWXU;
                umode_t group_mask = S_IRWXG;
-                umode_t other_mask = S_IRWXO;
+                umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
                ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
                                GFP_KERNEL);
@@ -392,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                                                     ppace[i]->type,
                                                     &fattr->cf_mode,
                                                     &other_mask);
+                        if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+                                access_flags_to_mode(ppace[i]->access_req,
+                                                     ppace[i]->type,
+                                                     &fattr->cf_mode,
+                                                     &other_mask);
 /*                      memcpy((void *)(&(cifscred->aces[i])),
                                (void *)ppace[i],
@@ -560,7 +567,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
-                return NULL;
+                return ERR_CAST(tlink);
        xid = GetXid();
        rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +575,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
        cifs_put_tlink(tlink);
-        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
+        cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+        if (rc)
+                return ERR_PTR(rc);
        return pntsd;
 }
@@ -583,7 +592,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
-                return NULL;
+                return ERR_CAST(tlink);
        tcon = tlink_tcon(tlink);
        xid = GetXid();
@@ -591,23 +600,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
                         &fid, &oplock, NULL, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        if (rc) {
+        if (!rc) {
-                cERROR(1, "Unable to open file to get ACL");
+                rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-                goto out;
+                CIFSSMBClose(xid, tcon, fid);
        }
-        rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
-        CIFSSMBClose(xid, tcon, fid);
- out:
        cifs_put_tlink(tlink);
        FreeXid(xid);
+        cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+        if (rc)
+                return ERR_PTR(rc);
        return pntsd;
 }
 /* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
                                      struct inode *inode, const char *path,
                                      u32 *pacllen)
 {
@@ -695,7 +703,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void
+int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
                  struct inode *inode, const char *path, const __u16 *pfid)
 {
@@ -711,17 +719,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
                pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
-        if (pntsd)
+        if (IS_ERR(pntsd)) {
+                rc = PTR_ERR(pntsd);
+                cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+        } else {
                rc = parse_sec_desc(pntsd, acllen, fattr);
-        if (rc)
+                kfree(pntsd);
-                cFYI(1, "parse sec desc failed rc = %d", rc);
+                if (rc)
+                        cERROR(1, "parse sec desc failed rc = %d", rc);
+        }
-        kfree(pntsd);
+        return rc;
-        return;
 }
 /* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
+int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
 {
        int rc = 0;
        __u32 secdesclen = 0;
@@ -736,7 +748,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        /* Add three ACEs for owner, group, everyone getting rid of
           other ACEs as chmod disables ACEs and set the security descriptor */
-        if (pntsd) {
+        if (IS_ERR(pntsd)) {
+                rc = PTR_ERR(pntsd);
+                cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+        } else {
                /* allocate memory for the smb header,
                   set security descriptor request security descriptor
                   parameters, and secuirty descriptor itself */
@@ -766,4 +781,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        return rc;
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf515..c4ae7d03656 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
        char sidname[SIDNAMELENGTH];
 } __attribute__((packed));
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern int match_sid(struct cifs_sid *);
 extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
-#endif /*  CONFIG_CIFS_EXPERIMENTAL */
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161a..0db5f1de022 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
-#include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
 #include "ntlmssp.h"
@@ -37,11 +36,6 @@
 /* Note that the smb header signature field on input contains the
        sequence number before this function is called */
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-                       unsigned char *p24);
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
                                struct TCP_Server_Info *server, char *signature)
 {
@@ -72,6 +66,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
        return 0;
 }
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
                  __u32 *pexpected_response_sequence_number)
 {
@@ -84,14 +79,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
        if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
                return rc;
-        spin_lock(&GlobalMid_Lock);
        cifs_pdu->Signature.Sequence.SequenceNumber =
                        cpu_to_le32(server->sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
        *pexpected_response_sequence_number = server->sequence_number++;
        server->sequence_number++;
-        spin_unlock(&GlobalMid_Lock);
        rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
        if (rc)
@@ -149,6 +142,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
        return rc;
 }
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
                   __u32 *pexpected_response_sequence_number)
 {
@@ -162,14 +156,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
        if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
                return rc;
-        spin_lock(&GlobalMid_Lock);
        cifs_pdu->Signature.Sequence.SequenceNumber =
                                cpu_to_le32(server->sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
        *pexpected_response_sequence_number = server->sequence_number++;
        server->sequence_number++;
-        spin_unlock(&GlobalMid_Lock);
        rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
        if (rc)
@@ -236,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 /* first calculate 24 bytes ntlm response and then 16 byte session key */
 int setup_ntlm_response(struct cifsSesInfo *ses)
 {
+        int rc = 0;
        unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
        char temp_key[CIFS_SESS_KEY_SIZE];
@@ -249,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
        }
        ses->auth_key.len = temp_len;
-        SMBNTencrypt(ses->password, ses->server->cryptkey,
+        rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
                        ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+        if (rc) {
+                cFYI(1, "%s Can't generate NTLM response, error: %d",
+                        __func__, rc);
+                return rc;
+        }
+        rc = E_md4hash(ses->password, temp_key);
+        if (rc) {
+                cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+                return rc;
+        }
-        E_md4hash(ses->password, temp_key);
+        rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
-        mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+        if (rc)
+                cFYI(1, "%s Can't generate NTLM session key, error: %d",
+                        __func__, rc);
-        return 0;
+        return rc;
 }
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -702,14 +708,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
        unsigned int size;
        server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
-        if (!server->secmech.hmacmd5 ||
+        if (IS_ERR(server->secmech.hmacmd5)) {
-                        IS_ERR(server->secmech.hmacmd5)) {
                cERROR(1, "could not allocate crypto hmacmd5\n");
                return PTR_ERR(server->secmech.hmacmd5);
        }
        server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
-        if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+        if (IS_ERR(server->secmech.md5)) {
                cERROR(1, "could not allocate crypto md5\n");
                rc = PTR_ERR(server->secmech.md5);
                goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec00647..00000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *   fs/cifs/cifsencrypt.h
- *
- *   Copyright (c) International Business Machines  Corp., 2005
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   Externs for misc. small encryption routines
- *   so we do not have to put them in cifsproto.h
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-/* md4.c */
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-/* smbdes.c */
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
-                  unsigned char *p24);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa7958..f2970136d17 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
                                   "Default: 50 Range: 2 to 256");
+unsigned short echo_retries = 5;
+module_param(echo_retries, ushort, 0644);
+MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
+                               "reconnecting server. Default: 5. 0 means "
+                               "never reconnect.");
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
@@ -116,7 +120,7 @@ cifs_read_super(struct super_block *sb, void *data,
                return -ENOMEM;
        spin_lock_init(&cifs_sb->tlink_tree_lock);
-        INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+        cifs_sb->tlink_tree = RB_ROOT;
        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
        if (rc) {
@@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
                goto out_no_root;
        }
+        /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
+        if (cifs_sb_master_tcon(cifs_sb)->nocase)
+                sb->s_d_op = &cifs_ci_dentry_ops;
+        else
+                sb->s_d_op = &cifs_dentry_ops;
 #ifdef CONFIG_CIFS_EXPERIMENTAL
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
                cFYI(1, "export ops supported");
@@ -283,10 +293,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int cifs_permission(struct inode *inode, int mask)
+static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct cifs_sb_info *cifs_sb;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        cifs_sb = CIFS_SB(inode->i_sb);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -298,7 +311,7 @@ static int cifs_permission(struct inode *inode, int mask)
                on the client (above and beyond ACL on servers) for
                servers which do not support setting and viewing mode bits,
                so allowing client to check permissions is useful */
-                return generic_permission(inode, mask, NULL);
+                return generic_permission(inode, mask, flags, NULL);
 }
 static struct kmem_cache *cifs_inode_cachep;
@@ -321,12 +334,13 @@ cifs_alloc_inode(struct super_block *sb)
        /* Until the file is open and we have gotten oplock
        info back from the server, can not assume caching of
        file data or metadata */
-        cifs_inode->clientCanCacheRead = false;
+        cifs_set_oplock_level(cifs_inode, 0);
-        cifs_inode->clientCanCacheAll = false;
        cifs_inode->delete_pending = false;
        cifs_inode->invalid_mapping = false;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
        cifs_inode->server_eof = 0;
+        cifs_inode->uniqueid = 0;
+        cifs_inode->createtime = 0;
        /* Can not set i_flags here - they get immediately overwritten
           to zero by the VFS */
@@ -335,10 +349,17 @@ cifs_alloc_inode(struct super_block *sb)
        return &cifs_inode->vfs_inode;
 }
+static void cifs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+}
 static void
 cifs_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+        call_rcu(&inode->i_rcu, cifs_i_callback);
 }
 static void
@@ -352,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
 static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
+        struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+        struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
        seq_printf(s, ",addr=");
-        switch (server->addr.sockAddr.sin_family) {
+        switch (server->dstaddr.ss_family) {
        case AF_INET:
-                seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
+                seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
                break;
        case AF_INET6:
-                seq_printf(s, "%pI6",
+                seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
-                           &server->addr.sockAddr6.sin6_addr.s6_addr);
+                if (sa6->sin6_scope_id)
-                if (server->addr.sockAddr6.sin6_scope_id)
+                        seq_printf(s, "%%%u", sa6->sin6_scope_id);
-                        seq_printf(s, "%%%u",
-                                   server->addr.sockAddr6.sin6_scope_id);
                break;
        default:
                seq_printf(s, "(unknown)");
@@ -459,9 +481,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
                seq_printf(s, ",acl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
                seq_printf(s, ",mfsymlinks");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
+                seq_printf(s, ",fsc");
        seq_printf(s, ",rsize=%d", cifs_sb->rsize);
        seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+        /* convert actimeo and display it in seconds */
+                seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
        return 0;
 }
@@ -574,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        ssize_t written;
+        int rc;
        written = generic_file_aio_write(iocb, iov, nr_segs, pos);
-        if (!CIFS_I(inode)->clientCanCacheAll)
-                filemap_fdatawrite(inode->i_mapping);
+        if (CIFS_I(inode)->clientCanCacheAll)
+                return written;
+        rc = filemap_fdatawrite(inode->i_mapping);
+        if (rc)
+                cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
        return written;
 }
@@ -707,6 +740,25 @@ const struct file_operations cifs_file_ops = {
        .setlease = cifs_setlease,
 };
+const struct file_operations cifs_file_strict_ops = {
+        .read = do_sync_read,
+        .write = do_sync_write,
+        .aio_read = cifs_strict_readv,
+        .aio_write = cifs_strict_writev,
+        .open = cifs_open,
+        .release = cifs_close,
+        .lock = cifs_lock,
+        .fsync = cifs_strict_fsync,
+        .flush = cifs_flush,
+        .mmap = cifs_file_strict_mmap,
+        .splice_read = generic_file_splice_read,
+        .llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+        .unlocked_ioctl = cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+        .setlease = cifs_setlease,
+};
 const struct file_operations cifs_file_direct_ops = {
        /* no aio, no readv -
           BB reevaluate whether they can be done with directio, no cache */
@@ -725,6 +777,7 @@ const struct file_operations cifs_file_direct_ops = {
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
 };
 const struct file_operations cifs_file_nobrl_ops = {
        .read = do_sync_read,
        .write = do_sync_write,
@@ -743,6 +796,24 @@ const struct file_operations cifs_file_nobrl_ops = {
        .setlease = cifs_setlease,
 };
+const struct file_operations cifs_file_strict_nobrl_ops = {
+        .read = do_sync_read,
+        .write = do_sync_write,
+        .aio_read = cifs_strict_readv,
+        .aio_write = cifs_strict_writev,
+        .open = cifs_open,
+        .release = cifs_close,
+        .fsync = cifs_strict_fsync,
+        .flush = cifs_flush,
+        .mmap = cifs_file_strict_mmap,
+        .splice_read = generic_file_splice_read,
+        .llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+        .unlocked_ioctl = cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+        .setlease = cifs_setlease,
+};
 const struct file_operations cifs_file_direct_nobrl_ops = {
        /* no mmap, no aio, no readv -
           BB reevaluate whether they can be done with directio, no cache */
@@ -934,7 +1005,6 @@ init_cifs(void)
        GlobalCurrentXid = 0;
        GlobalTotalActiveXid = 0;
        GlobalMaxActiveXid = 0;
-        memset(Local_System_Name, 0, 15);
        spin_lock_init(&cifs_tcp_ses_lock);
        spin_lock_init(&cifs_file_list_lock);
        spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b..14789a97304 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
                       struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
 extern int cifs_revalidate_dentry(struct dentry *);
+extern void cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
@@ -72,19 +73,27 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
 /* Functions related to files and directories */
 extern const struct file_operations cifs_file_ops;
 extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
-extern const struct file_operations cifs_file_nobrl_ops;
+extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
-extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */
+extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
+extern const struct file_operations cifs_file_direct_nobrl_ops;
+extern const struct file_operations cifs_file_strict_nobrl_ops;
 extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
-                         size_t read_size, loff_t *poffset);
+                              size_t read_size, loff_t *poffset);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+                                 unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-                         size_t write_size, loff_t *poffset);
+                               size_t write_size, loff_t *poffset);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+                                  unsigned long nr_segs, loff_t pos);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, int);
+extern int cifs_strict_fsync(struct file *, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -93,6 +102,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 extern const struct dentry_operations cifs_dentry_ops;
 extern const struct dentry_operations cifs_ci_dentry_ops;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
+#else
+#define cifs_dfs_d_automount NULL
+#endif
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
 extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.68"
+#define CIFS_VERSION   "1.69"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612..edd5b29b53c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
 #define CIFS_MIN_RCV_POOL 4
 /*
+ * default attribute cache timeout (jiffies)
+ */
+#define CIFS_DEF_ACTIMEO (1 * HZ)
+/*
+ * max attribute cache timeout (jiffies) - 2^30
+ */
+#define CIFS_MAX_ACTIMEO (1 << 30)
+/*
 * MAX_REQ is the maximum number of requests that WE will send
 * on one socket concurrently. It also matches the most common
 * value of max multiplex returned by servers.  We may
@@ -151,35 +161,27 @@ struct TCP_Server_Info {
        int srv_count; /* reference counter */
        /* 15 character server name + 0x20 16th byte indicating type = srv */
        char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+        enum statusEnum tcpStatus; /* what we think the status is */
        char *hostname; /* hostname portion of UNC string */
        struct socket *ssocket;
-        union {
+        struct sockaddr_storage dstaddr;
-                struct sockaddr_in sockAddr;
-                struct sockaddr_in6 sockAddr6;
-        } addr;
        struct sockaddr_storage srcaddr; /* locally bind to this IP */
+#ifdef CONFIG_NET_NS
+        struct net *net;
+#endif
        wait_queue_head_t response_q;
        wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
        struct list_head pending_mid_q;
-        void *Server_NlsInfo;   /* BB - placeholder for future NLS info  */
-        unsigned short server_codepage; /* codepage for the server    */
-        enum protocolEnum protocolType;
-        char versionMajor;
-        char versionMinor;
-        bool svlocal:1;                 /* local server or remote */
        bool noblocksnd;                /* use blocking sendmsg */
        bool noautotune;                /* do not autotune send buf sizes */
        bool tcp_nodelay;
        atomic_t inFlight;  /* number of requests on the wire to server */
-#ifdef CONFIG_CIFS_STATS2
-        atomic_t inSend; /* requests trying to send */
-        atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
-#endif
-        enum statusEnum tcpStatus; /* what we think the status is */
        struct mutex srv_mutex;
        struct task_struct *tsk;
        char server_GUID[16];
        char secMode;
+        bool session_estab; /* mark when very first sess is established */
+        u16 dialect; /* dialect index that server chose */
        enum securityEnum secType;
        unsigned int maxReq;    /* Clients should submit no more */
        /* than maxReq distinct unanswered SMBs to the server when using  */
@@ -192,31 +194,62 @@ struct TCP_Server_Info {
        unsigned int max_vcs;   /* maximum number of smb sessions, at least
                                   those that can be specified uniquely with
                                   vcnumbers */
-        char sessid[4];         /* unique token id for this session */
-        /* (returned on Negotiate */
        int capabilities; /* allow selective disabling of caps by smb sess */
        int timeAdj;  /* Adjust for difference in server time zone in sec */
        __u16 CurrentMid;         /* multiplex id - rotating counter */
        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
-        __u32 sequence_number; /* needed for CIFS PDU signature */
+        __u32 sequence_number; /* for signing, protected by srv_mutex */
        struct session_key session_key;
        unsigned long lstrp; /* when we got last response from this server */
-        u16 dialect; /* dialect index that server chose */
        struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
        /* extended security flavors that server supports */
+        bool    sec_ntlmssp;            /* supports NTLMSSP */
+        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
        bool    sec_kerberos;           /* supports plain Kerberos */
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
-        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
+        struct delayed_work     echo; /* echo ping workqueue job */
-        bool    sec_ntlmssp;            /* supports NTLMSSP */
-        bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_FSCACHE
        struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
+#ifdef CONFIG_CIFS_STATS2
+        atomic_t inSend; /* requests trying to send */
+        atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
+#endif
 };
 /*
+ * Helpers to allow the TCP_Server_Info->net field and related code to drop out
+ * when CONFIG_NET_NS isn't set.
+ */
+#ifdef CONFIG_NET_NS
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+        return srv->net;
+}
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+        srv->net = net;
+}
+#else
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+        return &init_net;
+}
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+}
+#endif
+/*
 * Session structure.  One of these for each uid session with a particular host
 */
 struct cifsSesInfo {
@@ -336,7 +369,8 @@ struct cifsTconInfo {
 * "get" on the container.
 */
 struct tcon_link {
-        unsigned long           tl_index;
+        struct rb_node          tl_rbnode;
+        uid_t                   tl_uid;
        unsigned long           tl_flags;
 #define TCON_LINK_MASTER        0
 #define TCON_LINK_PENDING       1
@@ -438,13 +472,14 @@ struct cifsInodeInfo {
        /* BB add in lists for dirty pages i.e. write caching info for oplock */
        struct list_head openFileList;
        __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
-        unsigned long time;     /* jiffies of last update/check of inode */
+        bool clientCanCacheRead;        /* read oplock */
-        bool clientCanCacheRead:1;      /* read oplock */
+        bool clientCanCacheAll;         /* read and writebehind oplock */
-        bool clientCanCacheAll:1;       /* read and writebehind oplock */
+        bool delete_pending;            /* DELETE_ON_CLOSE is set */
-        bool delete_pending:1;          /* DELETE_ON_CLOSE is set */
+        bool invalid_mapping;           /* pagecache is invalid */
-        bool invalid_mapping:1;         /* pagecache is invalid */
+        unsigned long time;             /* jiffies of last update of inode */
        u64  server_eof;                /* current file size on server */
        u64  uniqueid;                  /* server inode number */
+        u64  createtime;                /* creation time on server */
 #ifdef CONFIG_CIFS_FSCACHE
        struct fscache_cookie *fscache;
 #endif
@@ -499,6 +534,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
 #endif
+struct mid_q_entry;
+/*
+ * This is the prototype for the mid callback function. When creating one,
+ * take special care to avoid deadlocks. Things to bear in mind:
+ *
+ * - it will be called by cifsd
+ * - the GlobalMid_Lock will be held
+ * - the mid will be removed from the pending_mid_q list
+ */
+typedef void (mid_callback_t)(struct mid_q_entry *mid);
 /* one of these for every pending CIFS request to the server */
 struct mid_q_entry {
        struct list_head qhead; /* mids waiting on reply from this server */
@@ -510,7 +557,8 @@ struct mid_q_entry {
        unsigned long when_sent; /* time when smb send finished */
        unsigned long when_received; /* when demux complete (taken off wire) */
 #endif
-        struct task_struct *tsk;        /* task waiting for response */
+        mid_callback_t *callback; /* call completion callback */
+        void *callback_data;      /* general purpose pointer for callback */
        struct smb_hdr *resp_buf;       /* response buffer */
        int midState;   /* wish this were enum but can not pass to wait_event */
        __u8 command;   /* smb command code */
@@ -565,6 +613,7 @@ struct cifs_fattr {
        u64             cf_uniqueid;
        u64             cf_eof;
        u64             cf_bytes;
+        u64             cf_createtime;
        uid_t           cf_uid;
        gid_t           cf_gid;
        umode_t         cf_mode;
@@ -612,12 +661,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   CIFS_IOVEC            4    /* array of response buffers */
 /* Type of Request to SendReceive2 */
-#define   CIFS_STD_OP           0    /* normal request timeout */
+#define   CIFS_BLOCKING_OP      1    /* operation can block */
-#define   CIFS_LONG_OP          1    /* long op (up to 45 sec, oplock time) */
+#define   CIFS_ASYNC_OP         2    /* do not wait for response */
-#define   CIFS_VLONG_OP         2    /* sloow op - can take up to 180 seconds */
+#define   CIFS_TIMEOUT_MASK 0x003    /* only one of above set in req */
-#define   CIFS_BLOCKING_OP      4    /* operation can block */
-#define   CIFS_ASYNC_OP         8    /* do not wait for response */
-#define   CIFS_TIMEOUT_MASK 0x00F    /* only one of 5 above set in req */
 #define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
 #define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
 #define   CIFS_NO_RESP      0x040    /* no response buffer required */
@@ -745,8 +791,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;  /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
                                          /* on midQ entries */
-GLOBAL_EXTERN char Local_System_Name[15];
 /*
 *  Global counters, updated atomically
 */
@@ -782,6 +826,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+/* reconnect after this many failed echo attempts */
+GLOBAL_EXTERN unsigned short echo_retries;
 void cifs_oplock_break(struct work_struct *work);
 void cifs_oplock_break_get(struct cifsFileInfo *cfile);
 void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a..b5c8cc5d7a7 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
 #define _CIFSPDU_H
 #include <net/sock.h>
+#include <asm/unaligned.h>
 #include "smbfsctl.h"
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
 #define SMB_COM_SETATTR               0x09 /* trivial response */
 #define SMB_COM_LOCKING_ANDX          0x24 /* trivial response */
 #define SMB_COM_COPY                  0x29 /* trivial rsp, fail filename ignrd*/
+#define SMB_COM_ECHO                  0x2B /* echo request */
 #define SMB_COM_OPEN_ANDX             0x2D /* Legacy open for old servers */
 #define SMB_COM_READ_ANDX             0x2E
 #define SMB_COM_WRITE_ANDX            0x2F
@@ -425,11 +427,49 @@ struct smb_hdr {
        __u16 Mid;
        __u8 WordCount;
 } __attribute__((packed));
-/* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
-#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
+                         (2 * (smb_var)->WordCount))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
+#define pByteArea(smb_var) (BCC(smb_var) + 2)
+/* return a SMB packet's ByteCount, read as a host-endian (converted) value */
+static inline __u16
+get_bcc(struct smb_hdr *hdr)
+{
+        __u16 *bc_ptr = (__u16 *)BCC(hdr);
+        return get_unaligned(bc_ptr);
+}
+/* return a SMB packet's ByteCount in host order from its little-endian field */
+static inline __u16
+get_bcc_le(struct smb_hdr *hdr)
+{
+        __le16 *bc_ptr = (__le16 *)BCC(hdr);
+        return get_unaligned_le16(bc_ptr);
+}
+/* set the ByteCount for a SMB packet in host-byte order */
+static inline void
+put_bcc(__u16 count, struct smb_hdr *hdr)
+{
+        __u16 *bc_ptr = (__u16 *)BCC(hdr);
+        put_unaligned(count, bc_ptr);
+}
+/* set the ByteCount for a SMB packet in little-endian */
+static inline void
+put_bcc_le(__u16 count, struct smb_hdr *hdr)
+{
+        __le16 *bc_ptr = (__le16 *)BCC(hdr);
+        put_unaligned_le16(count, bc_ptr);
+}
 /*
 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
 *
 */
+typedef struct smb_com_echo_req {
+        struct  smb_hdr hdr;
+        __le16  EchoCount;
+        __le16  ByteCount;
+        char    Data[1];
+} __attribute__((packed)) ECHO_REQ;
+typedef struct smb_com_echo_rsp {
+        struct  smb_hdr hdr;
+        __le16  SequenceNumber;
+        __le16  ByteCount;
+        char    Data[1];
+} __attribute__((packed)) ECHO_RSP;
 typedef struct smb_com_logoff_andx_req {
        struct smb_hdr hdr;     /* wct = 2 */
        __u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf..8096f27ad9a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,12 +54,19 @@ do {								\
             __func__, curr_xid, (int)rc);                      \
 } while (0)
 extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+                                        struct cifsTconInfo *tcon);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 extern char *cifs_compose_mount_options(const char *sb_mountdata,
                const char *fullpath, const struct dfs_info3_param *ref,
                char **devname);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
+extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
+                                        struct TCP_Server_Info *server);
+extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
+extern int cifs_call_async(struct TCP_Server_Info *server,
+                           struct smb_hdr *in_buf, mid_callback_t *callback,
+                           void *cbdata);
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
                        struct smb_hdr * /* input */ ,
                        struct smb_hdr * /* out */ ,
@@ -78,10 +85,10 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
                                  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+                            unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-#endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,6 +111,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
                                      int offset);
+extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
 extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
                                struct file *file, struct tcon_link *tlink,
@@ -129,10 +137,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
                              struct cifs_fattr *fattr, struct inode *inode,
                              const char *path, const __u16 *pfid);
-extern int mode_to_acl(struct inode *inode, const char *path, __u64);
+extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
+                                        const char *, u32 *);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
                        const char *);
@@ -345,12 +355,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
                        const __u16 netfid, const __u64 len,
                        const __u64 offset, const __u32 numUnlock,
                        const __u32 numLock, const __u8 lockType,
-                        const bool waitFlag);
+                        const bool waitFlag, const __u8 oplock_level);
 extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
                        const __u16 smb_file_id, const int get_flag,
                        const __u64 len, struct file_lock *,
                        const __u16 lock_type, const bool waitFlag);
 extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBEcho(struct TCP_Server_Info *server);
 extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
 extern struct cifsSesInfo *sesInfoAlloc(void);
@@ -364,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 extern int cifs_verify_signature(struct smb_hdr *,
                                 struct TCP_Server_Info *server,
                                __u32 expected_sequence_number);
-extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
 extern int setup_ntlm_response(struct cifsSesInfo *);
 extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -414,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
 extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
                const unsigned char *path,
                struct cifs_sb_info *cifs_sb, int xid);
+extern int mdfour(unsigned char *, unsigned char *, int);
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
+                        unsigned char *p24);
+extern void E_P16(unsigned char *p14, unsigned char *p16);
+extern void E_P24(unsigned char *p21, const unsigned char *c8,
+                        unsigned char *p24);
 #endif                  /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5..3106f5e5c63 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
 static int validate_t2(struct smb_t2_rsp *pSMB)
 {
-        int rc = -EINVAL;
+        unsigned int total_size;
-        int total_size;
-        char *pBCC;
+        /* check for plausible wct */
+        if (pSMB->hdr.WordCount < 10)
+                goto vt2_err;
-        /* check for plausible wct, bcc and t2 data and parm sizes */
        /* check for parm and data offset going beyond end of smb */
-        if (pSMB->hdr.WordCount >= 10) {
+        if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
-                if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) &&
+            get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
-                   (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) {
+                goto vt2_err;
-                        /* check that bcc is at least as big as parms + data */
-                        /* check that bcc is less than negotiated smb buffer */
+        /* check that bcc is at least as big as parms + data */
-                        total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount);
+        /* check that bcc is less than negotiated smb buffer */
-                        if (total_size < 512) {
+        total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
-                                total_size +=
+        if (total_size >= 512)
-                                        le16_to_cpu(pSMB->t2_rsp.DataCount);
+                goto vt2_err;
-                                /* BCC le converted in SendReceive */
-                                pBCC = (pSMB->hdr.WordCount * 2) +
+        total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
-                                        sizeof(struct smb_hdr) +
+        if (total_size > get_bcc(&pSMB->hdr) ||
-                                        (char *)pSMB;
+            total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
-                                if ((total_size <= (*(u16 *)pBCC)) &&
+                goto vt2_err;
-                                   (total_size <
-                                        CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) {
+        return 0;
-                                        return 0;
+vt2_err:
-                                }
-                        }
-                }
-        }
        cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
                sizeof(struct smb_t2_rsp) + 16);
-        return rc;
+        return -EINVAL;
 }
 int
 CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 {
@@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
                cFYI(1, "Kerberos only mechanism, enable extended security");
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-        }
+        } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-        else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
        else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
                cFYI(1, "NTLMSSP only mechanism, enable extended security");
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
        }
-#endif
        count = 0;
        for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
                                (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
                server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
-                GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
                /* even though we do not use raw we might as well set this
                accurately, in case we ever find a need for it */
                if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -569,7 +563,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
        server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
        cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
-        GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
        server->capabilities = le32_to_cpu(pSMBr->Capabilities);
        server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
        server->timeAdj *= 60;
@@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
        return rc;
 }
+/*
+ * Completion callback attached to an echo request's mid entry.
+ *
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * The mid's callback_data is read as the struct TCP_Server_Info the mid
+ * belongs to. The mid entry is then deleted, the server's inFlight counter
+ * is decremented and anything sleeping on server->request_q is woken.
+ *
+ * FIXME: maybe we should consider checking that the reply matches request?
+ */
+static void
+cifs_echo_callback(struct mid_q_entry *mid)
+{
+        struct TCP_Server_Info *server = mid->callback_data; /* read before mid is deleted */
+        DeleteMidQEntry(mid);
+        atomic_dec(&server->inFlight);          /* this request is no longer in flight */
+        wake_up(&server->request_q);
+}
+int     /* 0 on success, else the error from small_smb_init/cifs_call_async */
+CIFSSMBEcho(struct TCP_Server_Info *server)     /* send one async SMB echo */
+{
+        ECHO_REQ *smb;
+        int rc = 0;
+        cFYI(1, "In echo request");
+        rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); /* no tcon */
+        if (rc)
+                return rc;
+        /*
+         * set up echo request: one parameter word (EchoCount = 1) and a
+         * byte count of 1 covering the single payload byte 'a'. Both
+         * 16-bit fields are stored little-endian via unaligned-safe helpers.
+         */
+        smb->hdr.Tid = cpu_to_le16(0xffff);
+        smb->hdr.WordCount = 1;
+        put_unaligned_le16(1, &smb->EchoCount);
+        put_bcc_le(1, &smb->hdr);
+        smb->Data[0] = 'a';
+        smb->hdr.smb_buf_length += 3;   /* presumably EchoCount (2) + data (1) -- TODO confirm */
+        rc = cifs_call_async(server, (struct smb_hdr *)smb,
+                                cifs_echo_callback, server); /* server is the callback data */
+        if (rc)
+                cFYI(1, "Echo request failed: %d", rc);
+        cifs_small_buf_release(smb);    /* released on success and failure alike */
+        return rc;
+}
 int
 CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 {
@@ -1196,7 +1236,7 @@ OldOpenRetry:
        pSMB->ByteCount = cpu_to_le16(count);
        /* long_op set to 1 to allow for oplock break timeouts */
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                        (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+                        (struct smb_hdr *)pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_opens);
        if (rc) {
                cFYI(1, "Error in Open = %d", rc);
@@ -1309,7 +1349,7 @@ openRetry:
        pSMB->ByteCount = cpu_to_le16(count);
        /* long_op set to 1 to allow for oplock break timeouts */
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                        (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+                        (struct smb_hdr *)pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_opens);
        if (rc) {
                cFYI(1, "Error in Open = %d", rc);
@@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
        iov[0].iov_base = (char *)pSMB;
        iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
        rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-                         &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR);
+                         &resp_buf_type, CIFS_LOG_ERROR);
        cifs_stats_inc(&tcon->num_reads);
        pSMBr = (READ_RSP *)iov[0].iov_base;
        if (rc) {
@@ -1666,7 +1706,8 @@ int
 CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
            const __u16 smb_file_id, const __u64 len,
            const __u64 offset, const __u32 numUnlock,
-            const __u32 numLock, const __u8 lockType, const bool waitFlag)
+            const __u32 numLock, const __u8 lockType,
+            const bool waitFlag, const __u8 oplock_level)
 {
        int rc = 0;
        LOCK_REQ *pSMB = NULL;
@@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
        pSMB->NumberOfLocks = cpu_to_le16(numLock);
        pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
        pSMB->LockType = lockType;
+        pSMB->OplockLevel = oplock_level;
        pSMB->AndXCommand = 0xFF;       /* none */
        pSMB->Fid = smb_file_id; /* netfid stays le */
@@ -2478,95 +2520,6 @@ querySymLinkRetry:
 }
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-/* Initialize NT TRANSACT SMB into small smb request buffer.
-   This assumes that all NT TRANSACTS that we init here have
-   total parm and data under about 400 bytes (to fit in small cifs
-   buffer size), which is the case so far, it easily fits. NB:
-        Setup words themselves and ByteCount
-        MaxSetupCount (size of returned setup area) and
-        MaxParameterCount (returned parms size) must be set by caller */
-static int
-smb_init_nttransact(const __u16 sub_command, const int setup_count,
-                   const int parm_len, struct cifsTconInfo *tcon,
-                   void **ret_buf)
-{
-        int rc;
-        __u32 temp_offset;
-        struct smb_com_ntransact_req *pSMB;
-        rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
-                                (void **)&pSMB);
-        if (rc)
-                return rc;
-        *ret_buf = (void *)pSMB;
-        pSMB->Reserved = 0;
-        pSMB->TotalParameterCount = cpu_to_le32(parm_len);
-        pSMB->TotalDataCount  = 0;
-        pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
-                                          MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-        pSMB->ParameterCount = pSMB->TotalParameterCount;
-        pSMB->DataCount  = pSMB->TotalDataCount;
-        temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
-                        (setup_count * 2) - 4 /* for rfc1001 length itself */;
-        pSMB->ParameterOffset = cpu_to_le32(temp_offset);
-        pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
-        pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
-        pSMB->SubCommand = cpu_to_le16(sub_command);
-        return 0;
-}
-static int
-validate_ntransact(char *buf, char **ppparm, char **ppdata,
-                   __u32 *pparmlen, __u32 *pdatalen)
-{
-        char *end_of_smb;
-        __u32 data_count, data_offset, parm_count, parm_offset;
-        struct smb_com_ntransact_rsp *pSMBr;
-        *pdatalen = 0;
-        *pparmlen = 0;
-        if (buf == NULL)
-                return -EINVAL;
-        pSMBr = (struct smb_com_ntransact_rsp *)buf;
-        /* ByteCount was converted from little endian in SendReceive */
-        end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
-                        (char *)&pSMBr->ByteCount;
-        data_offset = le32_to_cpu(pSMBr->DataOffset);
-        data_count = le32_to_cpu(pSMBr->DataCount);
-        parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
-        parm_count = le32_to_cpu(pSMBr->ParameterCount);
-        *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
-        *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
-        /* should we also check that parm and data areas do not overlap? */
-        if (*ppparm > end_of_smb) {
-                cFYI(1, "parms start after end of smb");
-                return -EINVAL;
-        } else if (parm_count + *ppparm > end_of_smb) {
-                cFYI(1, "parm end after end of smb");
-                return -EINVAL;
-        } else if (*ppdata > end_of_smb) {
-                cFYI(1, "data starts after end of smb");
-                return -EINVAL;
-        } else if (data_count + *ppdata > end_of_smb) {
-                cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
-                        *ppdata, data_count, (data_count + *ppdata),
-                        end_of_smb, pSMBr);
-                return -EINVAL;
-        } else if (parm_count + data_count > pSMBr->ByteCount) {
-                cFYI(1, "parm count and data count larger than SMB");
-                return -EINVAL;
-        }
-        *pdatalen = data_count;
-        *pparmlen = parm_count;
-        return 0;
-}
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
                        const unsigned char *searchName,
@@ -3056,7 +3009,97 @@ GetExtAttrOut:
 #endif /* CONFIG_POSIX */
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
+/*
+ * Initialize NT TRANSACT SMB into small smb request buffer.  This assumes that
+ * all NT TRANSACTS that we init here have total parm and data under about 400
+ * bytes (to fit in small cifs buffer size), which is the case so far, it
+ * easily fits. NB: Setup words themselves and ByteCount, MaxSetupCount (size
+ * of returned setup area) and MaxParameterCount (returned parms size) must be
+ * set by caller.
+ *
+ * sub_command: stored little-endian in the request's SubCommand field
+ * setup_count: number of setup words; added to the base word count of 19
+ *              and used (times 2) when computing the parameter offset
+ * parm_len:    parameter byte count; becomes TotalParameterCount and
+ *              ParameterCount, and positions DataOffset
+ * tcon:        passed to small_smb_init and dereferenced for the server's
+ *              maxBuf, so it must not be NULL here
+ * ret_buf:     receives the allocated request buffer on success
+ *
+ * Returns 0 on success or the error returned by small_smb_init, in which
+ * case *ret_buf is not written.
+ */
+static int
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
+                   const int parm_len, struct cifsTconInfo *tcon,
+                   void **ret_buf)
+{
+        int rc;
+        __u32 temp_offset;
+        struct smb_com_ntransact_req *pSMB;
+        rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
+                                (void **)&pSMB);
+        if (rc)
+                return rc;
+        *ret_buf = (void *)pSMB;
+        pSMB->Reserved = 0;
+        pSMB->TotalParameterCount = cpu_to_le32(parm_len);
+        pSMB->TotalDataCount  = 0;
+        pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+                                          MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); /* low 8 bits cleared */
+        pSMB->ParameterCount = pSMB->TotalParameterCount; /* already little-endian */
+        pSMB->DataCount  = pSMB->TotalDataCount;
+        temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
+                        (setup_count * 2) - 4 /* for rfc1001 length itself */;
+        pSMB->ParameterOffset = cpu_to_le32(temp_offset);
+        pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); /* data directly follows parms */
+        pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
+        pSMB->SubCommand = cpu_to_le16(sub_command);
+        return 0;
+}
+static int      /* 0 if the response passes every check below, else -EINVAL */
+validate_ntransact(char *buf, char **ppparm, char **ppdata,
+                   __u32 *pparmlen, __u32 *pdatalen)
+{
+        char *end_of_smb;
+        __u32 data_count, data_offset, parm_count, parm_offset;
+        struct smb_com_ntransact_rsp *pSMBr;
+        *pdatalen = 0;          /* lengths stay 0 on every failure path */
+        *pparmlen = 0;
+        if (buf == NULL)
+                return -EINVAL;
+        pSMBr = (struct smb_com_ntransact_rsp *)buf;
+        /*
+         * ByteCount was converted from little endian in SendReceive
+         *
+         * end_of_smb is computed as the address of the ByteCount field plus
+         * its own two bytes plus the ByteCount value.
+         */
+        end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+                        (char *)&pSMBr->ByteCount;
+        data_offset = le32_to_cpu(pSMBr->DataOffset);
+        data_count = le32_to_cpu(pSMBr->DataCount);
+        parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
+        parm_count = le32_to_cpu(pSMBr->ParameterCount);
+        *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; /* offsets are relative to hdr.Protocol */
+        *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; /* written even if a check below fails */
+        /*
+         * should we also check that parm and data areas do not overlap?
+         *
+         * The checks below reject, in order: a parameter area that starts
+         * or ends past end_of_smb, a data area that starts or ends past
+         * end_of_smb, and combined counts larger than ByteCount.
+         *
+         * NOTE(review): the counts and offsets come from the response and
+         * the pointer additions are not checked for wraparound -- confirm
+         * that this is acceptable for all callers.
+         */
+        if (*ppparm > end_of_smb) {
+                cFYI(1, "parms start after end of smb");
+                return -EINVAL;
+        } else if (parm_count + *ppparm > end_of_smb) {
+                cFYI(1, "parm end after end of smb");
+                return -EINVAL;
+        } else if (*ppdata > end_of_smb) {
+                cFYI(1, "data starts after end of smb");
+                return -EINVAL;
+        } else if (data_count + *ppdata > end_of_smb) {
+                cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
+                        *ppdata, data_count, (data_count + *ppdata),
+                        end_of_smb, pSMBr);
+                return -EINVAL;
+        } else if (parm_count + data_count > pSMBr->ByteCount) {
+                cFYI(1, "parm count and data count larger than SMB");
+                return -EINVAL;
+        }
+        *pdatalen = data_count; /* lengths are reported only on success */
+        *pparmlen = parm_count;
+        return 0;
+}
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
 CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3089,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
        iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
        rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
-                         CIFS_STD_OP);
+                         0);
        cifs_stats_inc(&tcon->num_acl_get);
        if (rc) {
                cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -3214,7 +3257,7 @@ setCifsAclRetry:
        return (rc);
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
+#endif /* CONFIG_CIFS_ACL */
 /* Legacy Query Path Information call for lookup to old servers such
   as Win9x/WinME */
@@ -5564,7 +5607,7 @@ QAllEAsRetry:
        }
        /* make sure list_len doesn't go past end of SMB */
-        end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+        end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
        if ((char *)ea_response_data + list_len > end_of_smb) {
                cFYI(1, "EA list appears to go beyond SMB");
                rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1..47d8ff62368 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,8 +52,8 @@
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
+/* SMB echo "timeout" -- FIXME: tunable? */
-                         unsigned char *p24);
+#define SMB_ECHO_INTERVAL (60 * HZ)
 extern mempool_t *cifs_req_poolp;
@@ -64,8 +64,8 @@ struct smb_vol {
        char *UNC;
        char *UNCip;
        char *iocharset;  /* local code page for mapping to and from Unicode */
-        char source_rfc1001_name[16]; /* netbios name of client */
+        char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
-        char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+        char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
        uid_t cred_uid;
        uid_t linux_uid;
        gid_t linux_gid;
@@ -84,6 +84,7 @@ struct smb_vol {
        bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
        bool server_ino:1; /* use inode numbers from server ie UniqueId */
        bool direct_io:1;
+        bool strict_io:1; /* strict cache behavior */
        bool remap:1;      /* set to remap seven reserved chars in filenames */
        bool posix_paths:1; /* unset to not ask for posix pathnames. */
        bool no_linux_ext:1;
@@ -105,6 +106,7 @@ struct smb_vol {
        unsigned int wsize;
        bool sockopt_tcp_nodelay:1;
        unsigned short int port;
+        unsigned long actimeo; /* attribute cache timeout (jiffies) */
        char *prepath;
        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
        struct nls_table *local_nls;
@@ -114,8 +116,9 @@ struct smb_vol {
 #define TLINK_ERROR_EXPIRE      (1 * HZ)
 #define TLINK_IDLE_EXPIRE       (600 * HZ)
-static int ipv4_connect(struct TCP_Server_Info *server);
+static int ip_connect(struct TCP_Server_Info *server);
-static int ipv6_connect(struct TCP_Server_Info *server);
+static int generic_ip_connect(struct TCP_Server_Info *server);
+static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
 static void cifs_prune_tlinks(struct work_struct *work);
 /*
@@ -150,6 +153,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
        /* before reconnecting the tcp session, mark the smb session (uid)
                and the tid bad so they are not used until reconnected */
+        cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &server->smb_ses_list) {
                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -161,7 +165,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
                }
        }
        spin_unlock(&cifs_tcp_ses_lock);
        /* do not want to be sending data on a socket we are freeing */
+        cFYI(1, "%s: tearing down socket", __func__);
        mutex_lock(&server->srv_mutex);
        if (server->ssocket) {
                cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -178,30 +184,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
        kfree(server->session_key.response);
        server->session_key.response = NULL;
        server->session_key.len = 0;
+        server->lstrp = jiffies;
+        mutex_unlock(&server->srv_mutex);
+        /* mark submitted MIDs for retry and issue callback */
+        cFYI(1, "%s: issuing mid callbacks", __func__);
        spin_lock(&GlobalMid_Lock);
-        list_for_each(tmp, &server->pending_mid_q) {
+        list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
-                mid_entry = list_entry(tmp, struct
+                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-                                        mid_q_entry,
+                if (mid_entry->midState == MID_REQUEST_SUBMITTED)
-                                        qhead);
-                if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-                                /* Mark other intransit requests as needing
-                                   retry so we do not immediately mark the
-                                   session bad again (ie after we reconnect
-                                   below) as they timeout too */
                        mid_entry->midState = MID_RETRY_NEEDED;
-                }
+                list_del_init(&mid_entry->qhead);
+                mid_entry->callback(mid_entry);
        }
        spin_unlock(&GlobalMid_Lock);
-        mutex_unlock(&server->srv_mutex);
        while ((server->tcpStatus != CifsExiting) &&
               (server->tcpStatus != CifsGood)) {
                try_to_freeze();
-                if (server->addr.sockAddr6.sin6_family == AF_INET6)
-                        rc = ipv6_connect(server);
+                /* we should try only the port we connected to before */
-                else
+                rc = generic_ip_connect(server);
-                        rc = ipv4_connect(server);
                if (rc) {
                        cFYI(1, "reconnect error %d", rc);
                        msleep(3000);
@@ -211,10 +214,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
                        if (server->tcpStatus != CifsExiting)
                                server->tcpStatus = CifsGood;
                        spin_unlock(&GlobalMid_Lock);
-        /*              atomic_set(&server->inFlight,0);*/
-                        wake_up(&server->response_q);
                }
        }
        return rc;
 }
@@ -228,9 +230,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
 static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 {
        struct smb_t2_rsp *pSMBt;
-        int total_data_size;
-        int data_in_this_rsp;
        int remaining;
+        __u16 total_data_size, data_in_this_rsp;
        if (pSMB->Command != SMB_COM_TRANSACTION2)
                return 0;
@@ -244,8 +245,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
        pSMBt = (struct smb_t2_rsp *)pSMB;
-        total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
+        total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
-        data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount);
+        data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
        remaining = total_data_size - data_in_this_rsp;
@@ -271,21 +272,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 {
        struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
        struct smb_t2_rsp *pSMBt  = (struct smb_t2_rsp *)pTargetSMB;
-        int total_data_size;
-        int total_in_buf;
-        int remaining;
-        int total_in_buf2;
        char *data_area_of_target;
        char *data_area_of_buf2;
-        __u16 byte_count;
+        int remaining;
+        __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
-        total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
+        total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
-        if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
+        if (total_data_size !=
+            get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
                cFYI(1, "total data size of primary and secondary t2 differ");
-        }
-        total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
+        total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
        remaining = total_data_size - total_in_buf;
@@ -295,28 +293,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
        if (remaining == 0) /* nothing to do, ignore */
                return 0;
-        total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
+        total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
        if (remaining < total_in_buf2) {
                cFYI(1, "transact2 2nd response contains too much data");
        }
        /* find end of first SMB data area */
        data_area_of_target = (char *)&pSMBt->hdr.Protocol +
-                                le16_to_cpu(pSMBt->t2_rsp.DataOffset);
+                                get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
        /* validate target area */
-        data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol +
+        data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
-                                        le16_to_cpu(pSMB2->t2_rsp.DataOffset);
+                                get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
        data_area_of_target += total_in_buf;
        /* copy second buffer into end of first buffer */
        memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
        total_in_buf += total_in_buf2;
-        pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf);
+        put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
-        byte_count = le16_to_cpu(BCC_LE(pTargetSMB));
+        byte_count = get_bcc_le(pTargetSMB);
        byte_count += total_in_buf2;
-        BCC_LE(pTargetSMB) = cpu_to_le16(byte_count);
+        put_bcc_le(byte_count, pTargetSMB);
        byte_count = pTargetSMB->smb_buf_length;
        byte_count += total_in_buf2;
@@ -330,7 +328,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
                return 0; /* we are done */
        } else /* more responses to go */
                return 1;
+}
+static void     /* delayed-work handler; always re-queues itself */
+cifs_echo_request(struct work_struct *work)
+{
+        int rc;
+        struct TCP_Server_Info *server = container_of(work,
+                                        struct TCP_Server_Info, echo.work);
+        /*
+         * no need to ping if we got a response recently
+         *
+         * "Recently" means jiffies is still before
+         * server->lstrp + SMB_ECHO_INTERVAL - HZ; in that case the echo is
+         * skipped and only the requeue below happens.
+         */
+        if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+                goto requeue_echo;
+        rc = CIFSSMBEcho(server);
+        if (rc)         /* a failed send is only logged; the work is still requeued */
+                cFYI(1, "Unable to send echo request to server: %s",
+                        server->hostname);
+requeue_echo:
+        queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
 }
 static int
@@ -344,8 +361,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
        struct msghdr smb_msg;
        struct kvec iov;
        struct socket *csocket = server->ssocket;
-        struct list_head *tmp;
+        struct list_head *tmp, *tmp2;
-        struct cifsSesInfo *ses;
        struct task_struct *task_to_wake = NULL;
        struct mid_q_entry *mid_entry;
        char temp;
@@ -398,7 +414,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
                smb_msg.msg_control = NULL;
                smb_msg.msg_controllen = 0;
                pdu_length = 4; /* enough to get RFC1001 header */
 incomplete_rcv:
+                if (echo_retries > 0 &&
+                    time_after(jiffies, server->lstrp +
+                                        (echo_retries * SMB_ECHO_INTERVAL))) {
+                        cERROR(1, "Server %s has not responded in %d seconds. "
+                                  "Reconnecting...", server->hostname,
+                                  (echo_retries * SMB_ECHO_INTERVAL / HZ));
+                        cifs_reconnect(server);
+                        csocket = server->ssocket;
+                        wake_up(&server->response_q);
+                        continue;
+                }
                length =
                    kernel_recvmsg(csocket, &smb_msg,
                                &iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -475,7 +504,7 @@ incomplete_rcv:
                         * initialize frame)
                         */
                        cifs_set_port((struct sockaddr *)
-                                        &server->addr.sockAddr, CIFS_PORT);
+                                        &server->dstaddr, CIFS_PORT);
                        cifs_reconnect(server);
                        csocket = server->ssocket;
                        wake_up(&server->response_q);
@@ -558,10 +587,11 @@ incomplete_rcv:
                        continue;
                }
+                mid_entry = NULL;
+                server->lstrp = jiffies;
-                task_to_wake = NULL;
                spin_lock(&GlobalMid_Lock);
-                list_for_each(tmp, &server->pending_mid_q) {
+                list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
                        mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
                        if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -602,20 +632,19 @@ incomplete_rcv:
                                mid_entry->resp_buf = smb_buffer;
                                mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
-                                task_to_wake = mid_entry->tsk;
                                mid_entry->midState = MID_RESPONSE_RECEIVED;
+                                list_del_init(&mid_entry->qhead);
+                                mid_entry->callback(mid_entry);
 #ifdef CONFIG_CIFS_STATS2
                                mid_entry->when_received = jiffies;
 #endif
-                                /* so we do not time out requests to  server
-                                which is still responding (since server could
-                                be busy but not dead) */
-                                server->lstrp = jiffies;
                                break;
                        }
+                        mid_entry = NULL;
                }
                spin_unlock(&GlobalMid_Lock);
-                if (task_to_wake) {
+                if (mid_entry != NULL) {
                        /* Was previous buf put in mpx struct for multi-rsp? */
                        if (!isMultiRsp) {
                                /* smb buffer will be freed by user thread */
@@ -624,11 +653,10 @@ multi_t2_fnd:
                                else
                                        smallbuf = NULL;
                        }
-                        wake_up_process(task_to_wake);
                } else if (!is_valid_oplock_break(smb_buffer, server) &&
                           !isMultiRsp) {
                        cERROR(1, "No task to wake, unknown frame received! "
-                                   "NumMids %d", midCount.counter);
+                                   "NumMids %d", atomic_read(&midCount));
                        cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
                                      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -676,44 +704,16 @@ multi_t2_fnd:
        if (smallbuf) /* no sense logging a debug message if NULL */
                cifs_small_buf_release(smallbuf);
-        /*
+        if (!list_empty(&server->pending_mid_q)) {
-         * BB: we shouldn't have to do any of this. It shouldn't be
-         * possible to exit from the thread with active SMB sessions
-         */
-        spin_lock(&cifs_tcp_ses_lock);
-        if (list_empty(&server->pending_mid_q)) {
-                /* loop through server session structures attached to this and
-                    mark them dead */
-                list_for_each(tmp, &server->smb_ses_list) {
-                        ses = list_entry(tmp, struct cifsSesInfo,
-                                         smb_ses_list);
-                        ses->status = CifsExiting;
-                        ses->server = NULL;
-                }
-                spin_unlock(&cifs_tcp_ses_lock);
-        } else {
-                /* although we can not zero the server struct pointer yet,
-                since there are active requests which may depnd on them,
-                mark the corresponding SMB sessions as exiting too */
-                list_for_each(tmp, &server->smb_ses_list) {
-                        ses = list_entry(tmp, struct cifsSesInfo,
-                                         smb_ses_list);
-                        ses->status = CifsExiting;
-                }
                spin_lock(&GlobalMid_Lock);
-                list_for_each(tmp, &server->pending_mid_q) {
+                list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
-                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+                        mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-                        if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
+                        cFYI(1, "Clearing Mid 0x%x - issuing callback",
-                                cFYI(1, "Clearing Mid 0x%x - waking up ",
                                         mid_entry->mid);
-                                task_to_wake = mid_entry->tsk;
+                        list_del_init(&mid_entry->qhead);
-                                if (task_to_wake)
+                        mid_entry->callback(mid_entry);
-                                        wake_up_process(task_to_wake);
-                        }
                }
                spin_unlock(&GlobalMid_Lock);
-                spin_unlock(&cifs_tcp_ses_lock);
                /* 1/8th of sec is more than enough time for them to exit */
                msleep(125);
        }
@@ -731,18 +731,6 @@ multi_t2_fnd:
                coming home not much else we can do but free the memory */
        }
-        /* last chance to mark ses pointers invalid
-        if there are any pointing to this (e.g
-        if a crazy root user tried to kill cifsd
-        kernel thread explicitly this might happen) */
-        /* BB: This shouldn't be necessary, see above */
-        spin_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &server->smb_ses_list) {
-                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-                ses->server = NULL;
-        }
-        spin_unlock(&cifs_tcp_ses_lock);
        kfree(server->hostname);
        task_to_wake = xchg(&server->tsk, NULL);
        kfree(server);
@@ -805,24 +793,21 @@ cifs_parse_mount_options(char *options, const char *devname,
        short int override_gid = -1;
        bool uid_specified = false;
        bool gid_specified = false;
+        char *nodename = utsname()->nodename;
        separator[0] = ',';
        separator[1] = 0;
-        if (Local_System_Name[0] != 0)
+        /*
-                memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
+         * does not have to be perfect mapping since field is
-        else {
+         * informational, only used for servers that do not support
-                char *nodename = utsname()->nodename;
+         * port 445 and it can be overridden at mount time
-                int n = strnlen(nodename, 15);
+         */
-                memset(vol->source_rfc1001_name, 0x20, 15);
+        memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
-                for (i = 0; i < n; i++) {
+        for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
-                        /* does not have to be perfect mapping since field is
+                vol->source_rfc1001_name[i] = toupper(nodename[i]);
-                        informational, only used for servers that do not support
-                        port 445 and it can be overridden at mount time */
+        vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
-                        vol->source_rfc1001_name[i] = toupper(nodename[i]);
-                }
-        }
-        vol->source_rfc1001_name[15] = 0;
        /* null target name indicates to use *SMBSERVR default called name
           if we end up sending RFC1001 session initialize */
        vol->target_rfc1001_name[0] = 0;
@@ -839,6 +824,8 @@ cifs_parse_mount_options(char *options, const char *devname,
        /* default to using server inode numbers where available */
        vol->server_ino = 1;
+        vol->actimeo = CIFS_DEF_ACTIMEO;
        if (!options)
                return 1;
@@ -984,13 +971,11 @@ cifs_parse_mount_options(char *options, const char *devname,
                                return 1;
                        } else if (strnicmp(value, "krb5", 4) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_KRB5;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
                        } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
                                        CIFSSEC_MUST_SIGN;
                        } else if (strnicmp(value, "ntlmssp", 7) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
-#endif
                        } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
                                vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
                                        CIFSSEC_MUST_SIGN;
@@ -1115,6 +1100,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                } else if (!strnicmp(data, "uid", 3) && value && *value) {
                        vol->linux_uid = simple_strtoul(value, &value, 0);
                        uid_specified = true;
+                } else if (!strnicmp(data, "cruid", 5) && value && *value) {
+                        vol->cred_uid = simple_strtoul(value, &value, 0);
                } else if (!strnicmp(data, "forceuid", 8)) {
                        override_uid = 1;
                } else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1167,22 +1154,22 @@ cifs_parse_mount_options(char *options, const char *devname,
                        if (!value || !*value || (*value == ' ')) {
                                cFYI(1, "invalid (empty) netbiosname");
                        } else {
-                                memset(vol->source_rfc1001_name, 0x20, 15);
+                                memset(vol->source_rfc1001_name, 0x20,
-                                for (i = 0; i < 15; i++) {
+                                        RFC1001_NAME_LEN);
-                                /* BB are there cases in which a comma can be
+                                /*
-                                valid in this workstation netbios name (and need
+                                 * FIXME: are there cases in which a comma can
-                                special handling)? */
+                                 * be valid in workstation netbios name (and
+                                 * need special handling)?
-                                /* We do not uppercase netbiosname for user */
+                                 */
+                                for (i = 0; i < RFC1001_NAME_LEN; i++) {
+                                        /* don't ucase netbiosname for user */
                                        if (value[i] == 0)
                                                break;
-                                        else
+                                        vol->source_rfc1001_name[i] = value[i];
-                                                vol->source_rfc1001_name[i] =
-                                                                value[i];
                                }
                                /* The string has 16th byte zero still from
                                set at top of the function  */
-                                if ((i == 15) && (value[i] != 0))
+                                if (i == RFC1001_NAME_LEN && value[i] != 0)
                                        printk(KERN_WARNING "CIFS: netbiosname"
                                                " longer than 15 truncated.\n");
                        }
@@ -1192,7 +1179,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                                cFYI(1, "empty server netbiosname specified");
                        } else {
                                /* last byte, type, is 0x20 for servr type */
-                                memset(vol->target_rfc1001_name, 0x20, 16);
+                                memset(vol->target_rfc1001_name, 0x20,
+                                        RFC1001_NAME_LEN_WITH_NULL);
                                for (i = 0; i < 15; i++) {
                                /* BB are there cases in which a comma can be
@@ -1209,10 +1197,20 @@ cifs_parse_mount_options(char *options, const char *devname,
                                }
                                /* The string has 16th byte zero still from
                                   set at top of the function  */
-                                if ((i == 15) && (value[i] != 0))
+                                if (i == RFC1001_NAME_LEN && value[i] != 0)
                                        printk(KERN_WARNING "CIFS: server net"
                                        "biosname longer than 15 truncated.\n");
                        }
+                } else if (strnicmp(data, "actimeo", 7) == 0) {
+                        if (value && *value) {
+                                vol->actimeo = HZ * simple_strtoul(value,
+                                                                   &value, 0);
+                                if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+                                        cERROR(1, "CIFS: attribute cache"
+                                                        "timeout too large");
+                                        return 1;
+                                }
+                        }
                } else if (strnicmp(data, "credentials", 4) == 0) {
                        /* ignore */
                } else if (strnicmp(data, "version", 3) == 0) {
@@ -1330,10 +1328,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                        vol->no_psx_acl = 0;
                } else if (strnicmp(data, "noacl", 5) == 0) {
                        vol->no_psx_acl = 1;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
                } else if (strnicmp(data, "locallease", 6) == 0) {
                        vol->local_lease = 1;
-#endif
                } else if (strnicmp(data, "sign", 4) == 0) {
                        vol->secFlg |= CIFSSEC_MUST_SIGN;
                } else if (strnicmp(data, "seal", 4) == 0) {
@@ -1346,11 +1342,18 @@ cifs_parse_mount_options(char *options, const char *devname,
                        vol->direct_io = 1;
                } else if (strnicmp(data, "forcedirectio", 13) == 0) {
                        vol->direct_io = 1;
+                } else if (strnicmp(data, "strictcache", 11) == 0) {
+                        vol->strict_io = 1;
                } else if (strnicmp(data, "noac", 4) == 0) {
                        printk(KERN_WARNING "CIFS: Mount option noac not "
                                "supported. Instead set "
                                "/proc/fs/cifs/LookupCacheEnabled to 0\n");
                } else if (strnicmp(data, "fsc", 3) == 0) {
+#ifndef CONFIG_CIFS_FSCACHE
+                        cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
+                                  "kernel config option set");
+                        return 1;
+#endif
                        vol->fsc = true;
                } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
                        vol->mfsymlinks = true;
@@ -1438,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
        }
 }
+/*
+ * Check whether the destination port stored in @server matches the port
+ * carried in @addr.
+ *
+ * If no port is specified in the addr structure (the port field is zero),
+ * we first try to match against port 445 (CIFS_PORT) and, if that fails,
+ * against port 139 (RFC1001_PORT). It should be called only if the address
+ * families of server and addr are equal: the cast applied to
+ * server->dstaddr is chosen from addr->sa_family alone.
+ *
+ * Returns true on a match. An address family other than AF_INET or
+ * AF_INET6 triggers WARN_ON() and returns false.
+ */
+static bool
+match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+        unsigned short int port, *sport; /* wanted port; server's stored port */
+        switch (addr->sa_family) {
+        case AF_INET:
+                sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
+                port = ((struct sockaddr_in *) addr)->sin_port;
+                break;
+        case AF_INET6:
+                sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
+                port = ((struct sockaddr_in6 *) addr)->sin6_port;
+                break;
+        default:
+                WARN_ON(1);
+                return false;
+        }
+        if (!port) { /* no port requested: try the two defaults in order */
+                port = htons(CIFS_PORT); /* htons() so both sides use the same byte order */
+                if (port == *sport)
+                        return true;
+                port = htons(RFC1001_PORT); /* fallback, compared by the return below */
+        }
+        return port == *sport;
+}
 static bool
 match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
              struct sockaddr *srcaddr)
 {
-        struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
-        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
        switch (addr->sa_family) {
-        case AF_INET:
+        case AF_INET: {
-                if (addr4->sin_addr.s_addr !=
+                struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
-                    server->addr.sockAddr.sin_addr.s_addr)
+                struct sockaddr_in *srv_addr4 =
-                        return false;
+                                        (struct sockaddr_in *)&server->dstaddr;
-                if (addr4->sin_port &&
-                    addr4->sin_port != server->addr.sockAddr.sin_port)
+                if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
                        return false;
                break;
-        case AF_INET6:
+        }
+        case AF_INET6: {
+                struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+                struct sockaddr_in6 *srv_addr6 =
+                                        (struct sockaddr_in6 *)&server->dstaddr;
                if (!ipv6_addr_equal(&addr6->sin6_addr,
-                                     &server->addr.sockAddr6.sin6_addr))
+                                     &srv_addr6->sin6_addr))
-                        return false;
-                if (addr6->sin6_scope_id !=
-                    server->addr.sockAddr6.sin6_scope_id)
                        return false;
-                if (addr6->sin6_port &&
+                if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
-                    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
                        return false;
                break;
        }
+        default:
+                WARN_ON(1);
+                return false; /* don't expect to be here */
+        }
        if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
                return false;
@@ -1529,10 +1568,16 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+                if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+                        continue;
                if (!match_address(server, addr,
                                   (struct sockaddr *)&vol->srcaddr))
                        continue;
+                if (!match_port(server, addr))
+                        continue;
                if (!match_security(server, vol))
                        continue;
@@ -1556,9 +1601,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
                return;
        }
+        put_net(cifs_net_ns(server));
        list_del_init(&server->tcp_ses_list);
        spin_unlock(&cifs_tcp_ses_lock);
+        cancel_delayed_work_sync(&server->echo);
        spin_lock(&GlobalMid_Lock);
        server->tcpStatus = CifsExiting;
        spin_unlock(&GlobalMid_Lock);
@@ -1628,6 +1677,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                goto out_err;
        }
+        cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
        tcp_ses->hostname = extract_hostname(volume_info->UNC);
        if (IS_ERR(tcp_ses->hostname)) {
                rc = PTR_ERR(tcp_ses->hostname);
@@ -1648,8 +1698,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
        tcp_ses->session_estab = false;
        tcp_ses->sequence_number = 0;
+        tcp_ses->lstrp = jiffies;
        INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
        INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
+        INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
        /*
         * at this point we are the only ones with the pointer
@@ -1665,14 +1717,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                cFYI(1, "attempting ipv6 connect");
                /* BB should we allow ipv6 on port 139? */
                /* other OS never observed in Wild doing 139 with v6 */
-                memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
+                memcpy(&tcp_ses->dstaddr, sin_server6,
-                        sizeof(struct sockaddr_in6));
+                       sizeof(struct sockaddr_in6));
-                rc = ipv6_connect(tcp_ses);
+        } else
-        } else {
+                memcpy(&tcp_ses->dstaddr, sin_server,
-                memcpy(&tcp_ses->addr.sockAddr, sin_server,
+                       sizeof(struct sockaddr_in));
-                        sizeof(struct sockaddr_in));
-                rc = ipv4_connect(tcp_ses);
+        rc = ip_connect(tcp_ses);
-        }
        if (rc < 0) {
                cERROR(1, "Error connecting to socket. Aborting operation");
                goto out_err_crypto_release;
@@ -1699,11 +1750,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        cifs_fscache_get_client_cookie(tcp_ses);
+        /* queue echo request delayed work */
+        queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
        return tcp_ses;
 out_err_crypto_release:
        cifs_crypto_shash_release(tcp_ses);
+        put_net(cifs_net_ns(tcp_ses));
 out_err:
        if (tcp_ses) {
                if (!IS_ERR(tcp_ses->hostname))
@@ -1777,6 +1833,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
        int rc = -ENOMEM, xid;
        struct cifsSesInfo *ses;
+        struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
        xid = GetXid();
@@ -1820,12 +1878,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        /* new SMB session uses our server ref */
        ses->server = server;
-        if (server->addr.sockAddr6.sin6_family == AF_INET6)
+        if (server->dstaddr.ss_family == AF_INET6)
-                sprintf(ses->serverName, "%pI6",
+                sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
-                        &server->addr.sockAddr6.sin6_addr);
        else
-                sprintf(ses->serverName, "%pI4",
+                sprintf(ses->serverName, "%pI4", &addr->sin_addr);
-                        &server->addr.sockAddr.sin_addr.s_addr);
        if (volume_info->username)
                strncpy(ses->userName, volume_info->username,
@@ -2120,19 +2176,106 @@ bind_socket(struct TCP_Server_Info *server)
 }
 static int
-ipv4_connect(struct TCP_Server_Info *server)
+ip_rfc1001_connect(struct TCP_Server_Info *server)
 {
        int rc = 0;
-        int val;
+        /*
-        bool connected = false;
+         * some servers require RFC1001 sessinit before sending
-        __be16 orig_port = 0;
+         * negprot - BB check reconnection in case where second
+         * sessinit is sent but no second negprot
+         *
+         * This helper allocates a zeroed rfc1002_session_packet, fills in
+         * the called and calling names (each passed through
+         * rfc1002mangle()), sends the packet with smb_send(), frees it and
+         * then sleeps briefly.
+         *
+         * Returns the smb_send() result, or 0 if the packet could not be
+         * allocated (rc is never changed from its initial value then).
+         */
+        struct rfc1002_session_packet *ses_init_buf;
+        struct smb_hdr *smb_buf;
+        ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
+                               GFP_KERNEL);
+        if (ses_init_buf) {
+                ses_init_buf->trailer.session_req.called_len = 32;
+                if (server->server_RFC1001_name && /* use the configured name if non-empty */
+                    server->server_RFC1001_name[0] != 0)
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.called_name,
+                                      server->server_RFC1001_name,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                else /* otherwise fall back to DEFAULT_CIFS_CALLED_NAME */
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.called_name,
+                                      DEFAULT_CIFS_CALLED_NAME,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                ses_init_buf->trailer.session_req.calling_len = 32;
+                /*
+                 * calling name ends in null (byte 16) from old smb
+                 * convention. As with the called name, the configured
+                 * workstation name is used when it is non-empty and a
+                 * fixed default string otherwise.
+                 */
+                if (server->workstation_RFC1001_name &&
+                    server->workstation_RFC1001_name[0] != 0)
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.calling_name,
+                                      server->workstation_RFC1001_name,
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                else
+                        rfc1002mangle(ses_init_buf->trailer.
+                                      session_req.calling_name,
+                                      "LINUX_CIFS_CLNT",
+                                      RFC1001_NAME_LEN_WITH_NULL);
+                ses_init_buf->trailer.session_req.scope1 = 0;
+                ses_init_buf->trailer.session_req.scope2 = 0;
+                smb_buf = (struct smb_hdr *)ses_init_buf; /* same buffer, viewed as an smb_hdr for smb_send() */
+                /*
+                 * sizeof RFC1002_SESSION_REQUEST with no scope
+                 *
+                 * The low bits (0x44) equal the length handed to smb_send()
+                 * below. NOTE(review): the leading 0x81 byte is presumably
+                 * the RFC1002 session-request packet type - confirm.
+                 */
+                smb_buf->smb_buf_length = 0x81000044;
+                rc = smb_send(server, smb_buf, 0x44);
+                kfree(ses_init_buf);
+                /*
+                 * RFC1001 layer in at least one server
+                 * requires very short break before negprot
+                 * presumably because not expecting negprot
+                 * to follow so fast.  This is a simple
+                 * solution that works without
+                 * complicating the code and causes no
+                 * significant slowing down on mount
+                 * for everyone else
+                 *
+                 * The sleep happens whether or not smb_send()
+                 * succeeded; rc is not examined first.
+                 */
+                usleep_range(1000, 2000);
+        }
+        /*
+         * else the negprot may still work without this
+         * even though malloc failed
+         *
+         * (rc is still 0 on that path, so an allocation failure is
+         * reported to the caller as success)
+         */
+        return rc;
+}
+static int
+generic_ip_connect(struct TCP_Server_Info *server)
+{
+        int rc = 0;
+        unsigned short int sport;
+        int slen, sfamily;
        struct socket *socket = server->ssocket;
+        struct sockaddr *saddr;
+        saddr = (struct sockaddr *) &server->dstaddr;
+        if (server->dstaddr.ss_family == AF_INET6) {
+                sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
+                slen = sizeof(struct sockaddr_in6);
+                sfamily = AF_INET6;
+        } else {
+                sport = ((struct sockaddr_in *) saddr)->sin_port;
+                slen = sizeof(struct sockaddr_in);
+                sfamily = AF_INET;
+        }
        if (socket == NULL) {
-                rc = sock_create_kern(PF_INET, SOCK_STREAM,
+                rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
-                                      IPPROTO_TCP, &socket);
+                                   IPPROTO_TCP, &socket, 1);
                if (rc < 0) {
                        cERROR(1, "Error %d creating socket", rc);
+                        server->ssocket = NULL;
                        return rc;
                }
@@ -2140,63 +2283,28 @@ ipv4_connect(struct TCP_Server_Info *server)
                cFYI(1, "Socket created");
                server->ssocket = socket;
                socket->sk->sk_allocation = GFP_NOFS;
-                cifs_reclassify_socket4(socket);
+                if (sfamily == AF_INET6)
+                        cifs_reclassify_socket6(socket);
+                else
+                        cifs_reclassify_socket4(socket);
        }
        rc = bind_socket(server);
        if (rc < 0)
                return rc;
-        /* user overrode default port */
+        rc = socket->ops->connect(socket, saddr, slen, 0);
-        if (server->addr.sockAddr.sin_port) {
+        if (rc < 0) {
-                rc = socket->ops->connect(socket, (struct sockaddr *)
+                cFYI(1, "Error %d connecting to server", rc);
-                                          &server->addr.sockAddr,
-                                          sizeof(struct sockaddr_in), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        if (!connected) {
-                /* save original port so we can retry user specified port
-                        later if fall back ports fail this time  */
-                orig_port = server->addr.sockAddr.sin_port;
-                /* do not retry on the same port we just failed on */
-                if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
-                        server->addr.sockAddr.sin_port = htons(CIFS_PORT);
-                        rc = socket->ops->connect(socket,
-                                                (struct sockaddr *)
-                                                &server->addr.sockAddr,
-                                                sizeof(struct sockaddr_in), 0);
-                        if (rc >= 0)
-                                connected = true;
-                }
-        }
-        if (!connected) {
-                server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
-                rc = socket->ops->connect(socket, (struct sockaddr *)
-                                              &server->addr.sockAddr,
-                                              sizeof(struct sockaddr_in), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        /* give up here - unless we want to retry on different
-                protocol families some day */
-        if (!connected) {
-                if (orig_port)
-                        server->addr.sockAddr.sin_port = orig_port;
-                cFYI(1, "Error %d connecting to server via ipv4", rc);
                sock_release(socket);
                server->ssocket = NULL;
                return rc;
        }
        /*
         * Eventually check for other socket options to change from
-         *  the default. sock_setsockopt not used because it expects
+         * the default. sock_setsockopt not used because it expects
-         *  user space buffer
+         * user space buffer
         */
        socket->sk->sk_rcvtimeo = 7 * HZ;
        socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2210,7 +2318,7 @@ ipv4_connect(struct TCP_Server_Info *server)
        }
        if (server->tcp_nodelay) {
-                val = 1;
+                int val = 1;
                rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
                                (char *)&val, sizeof(val));
                if (rc)
@@ -2221,161 +2329,39 @@ ipv4_connect(struct TCP_Server_Info *server)
                 socket->sk->sk_sndbuf,
                 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
-        /* send RFC1001 sessinit */
+        if (sport == htons(RFC1001_PORT))
-        if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
+                rc = ip_rfc1001_connect(server);
-                /* some servers require RFC1001 sessinit before sending
-                negprot - BB check reconnection in case where second
-                sessinit is sent but no second negprot */
-                struct rfc1002_session_packet *ses_init_buf;
-                struct smb_hdr *smb_buf;
-                ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
-                                       GFP_KERNEL);
-                if (ses_init_buf) {
-                        ses_init_buf->trailer.session_req.called_len = 32;
-                        if (server->server_RFC1001_name &&
-                            server->server_RFC1001_name[0] != 0)
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.called_name,
-                                              server->server_RFC1001_name,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        else
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.called_name,
-                                              DEFAULT_CIFS_CALLED_NAME,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        ses_init_buf->trailer.session_req.calling_len = 32;
-                        /* calling name ends in null (byte 16) from old smb
-                        convention. */
-                        if (server->workstation_RFC1001_name &&
-                            server->workstation_RFC1001_name[0] != 0)
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.calling_name,
-                                              server->workstation_RFC1001_name,
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        else
-                                rfc1002mangle(ses_init_buf->trailer.
-                                                session_req.calling_name,
-                                              "LINUX_CIFS_CLNT",
-                                              RFC1001_NAME_LEN_WITH_NULL);
-                        ses_init_buf->trailer.session_req.scope1 = 0;
-                        ses_init_buf->trailer.session_req.scope2 = 0;
-                        smb_buf = (struct smb_hdr *)ses_init_buf;
-                        /* sizeof RFC1002_SESSION_REQUEST with no scope */
-                        smb_buf->smb_buf_length = 0x81000044;
-                        rc = smb_send(server, smb_buf, 0x44);
-                        kfree(ses_init_buf);
-                        msleep(1); /* RFC1001 layer in at least one server
-                                      requires very short break before negprot
-                                      presumably because not expecting negprot
-                                      to follow so fast.  This is a simple
-                                      solution that works without
-                                      complicating the code and causes no
-                                      significant slowing down on mount
-                                      for everyone else */
-                }
-                /* else the negprot may still work without this
-                even though malloc failed */
-        }
        return rc;
 }
 static int
-ipv6_connect(struct TCP_Server_Info *server)
+ip_connect(struct TCP_Server_Info *server)
 {
-        int rc = 0;
+        unsigned short int *sport;        /* -> port field inside dstaddr */
-        int val;
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
-        bool connected = false;
+        struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
-        __be16 orig_port = 0;
-        struct socket *socket = server->ssocket;
-        if (socket == NULL) {
+        if (server->dstaddr.ss_family == AF_INET6)
-                rc = sock_create_kern(PF_INET6, SOCK_STREAM,
+                sport = &addr6->sin6_port;
-                                      IPPROTO_TCP, &socket);
+        else
-                if (rc < 0) {
+                sport = &addr->sin_port;
-                        cERROR(1, "Error %d creating ipv6 socket", rc);
-                        socket = NULL;
-                        return rc;
-                }
-                /* BB other socket options to set KEEPALIVE, NODELAY? */
+        if (*sport == 0) {
-                cFYI(1, "ipv6 Socket created");
+                int rc;
-                server->ssocket = socket;
-                socket->sk->sk_allocation = GFP_NOFS;
-                cifs_reclassify_socket6(socket);
-        }
-        rc = bind_socket(server);
+                /*
+                 * No port is set in dstaddr: try port 445 (CIFS_PORT)
+                 * first, writing it into dstaddr through *sport, and
+                 * return straight away if generic_ip_connect() reports
+                 * success (rc >= 0).
+                 */
-        if (rc < 0)
+                *sport = htons(CIFS_PORT);
-                return rc;
-        /* user overrode default port */
+                rc = generic_ip_connect(server);
-        if (server->addr.sockAddr6.sin6_port) {
-                rc = socket->ops->connect(socket,
-                                (struct sockaddr *) &server->addr.sockAddr6,
-                                sizeof(struct sockaddr_in6), 0);
                if (rc >= 0)
-                        connected = true;
+                        return rc;
-        }
-        if (!connected) {
-                /* save original port so we can retry user specified port
-                        later if fall back ports fail this time  */
-                orig_port = server->addr.sockAddr6.sin6_port;
-                /* do not retry on the same port we just failed on */
-                if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
-                        server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
-                        rc = socket->ops->connect(socket, (struct sockaddr *)
-                                        &server->addr.sockAddr6,
-                                        sizeof(struct sockaddr_in6), 0);
-                        if (rc >= 0)
-                                connected = true;
-                }
-        }
-        if (!connected) {
-                server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
-                rc = socket->ops->connect(socket, (struct sockaddr *)
-                                &server->addr.sockAddr6,
-                                sizeof(struct sockaddr_in6), 0);
-                if (rc >= 0)
-                        connected = true;
-        }
-        /* give up here - unless we want to retry on different
-                protocol families some day */
-        if (!connected) {
-                if (orig_port)
-                        server->addr.sockAddr6.sin6_port = orig_port;
-                cFYI(1, "Error %d connecting to server via ipv6", rc);
-                sock_release(socket);
-                server->ssocket = NULL;
-                return rc;
-        }
-        /*
-         * Eventually check for other socket options to change from
-         * the default. sock_setsockopt not used because it expects
-         * user space buffer
-         */
-        socket->sk->sk_rcvtimeo = 7 * HZ;
-        socket->sk->sk_sndtimeo = 5 * HZ;
-        if (server->tcp_nodelay) {
+                /*
+                 * If it failed, switch to port 139 (RFC1001_PORT); the
+                 * attempt itself is made by the common call below.
+                 */
-                val = 1;
+                *sport = htons(RFC1001_PORT);
-                rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
-                                (char *)&val, sizeof(val));
-                if (rc)
-                        cFYI(1, "set TCP_NODELAY socket option error %d", rc);
        }
-        server->ssocket = socket;
+        return generic_ip_connect(server); /* preset port, or 139 fallback */
-        return rc;
 }
 void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
@@ -2565,6 +2551,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
        cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
                cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+        cifs_sb->actimeo = pvolume_info->actimeo;
        if (pvolume_info->noperm)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
        if (pvolume_info->setuids)
@@ -2596,6 +2584,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
        if (pvolume_info->multiuser)
                cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
                                            CIFS_MOUNT_NO_PERM);
+        if (pvolume_info->strict_io)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
        if (pvolume_info->direct_io) {
                cFYI(1, "mounting share using direct i/o");
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
@@ -2815,13 +2805,13 @@ remote_path_check:
        /* check if a whole path (including prepath) is not remote */
        if (!rc && cifs_sb->prepathlen && tcon) {
                /* build_path_to_root works only when we have a valid tcon */
-                full_path = cifs_build_path_to_root(cifs_sb);
+                full_path = cifs_build_path_to_root(cifs_sb, tcon);
                if (full_path == NULL) {
                        rc = -ENOMEM;
                        goto mount_fail_check;
                }
                rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-                if (rc != -EREMOTE) {
+                if (rc != 0 && rc != -EREMOTE) {
                        kfree(full_path);
                        goto mount_fail_check;
                }
@@ -2900,24 +2890,16 @@ remote_path_check:
                goto mount_fail_check;
        }
-        tlink->tl_index = pSesInfo->linux_uid;
+        tlink->tl_uid = pSesInfo->linux_uid;
        tlink->tl_tcon = tcon;
        tlink->tl_time = jiffies;
        set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
        set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
-        rc = radix_tree_preload(GFP_KERNEL);
+        cifs_sb->master_tlink = tlink;
-        if (rc == -ENOMEM) {
-                kfree(tlink);
-                goto mount_fail_check;
-        }
        spin_lock(&cifs_sb->tlink_tree_lock);
-        radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
+        tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
-        radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
-                           CIFS_TLINK_MASTER_TAG);
        spin_unlock(&cifs_sb->tlink_tree_lock);
-        radix_tree_preload_end();
        queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
                                TLINK_IDLE_EXPIRE);
@@ -2960,8 +2942,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        TCONX_RSP *pSMBr;
        unsigned char *bcc_ptr;
        int rc = 0;
-        int length, bytes_left;
+        int length;
-        __u16 count;
+        __u16 bytes_left, count;
        if (ses == NULL)
                return -EIO;
@@ -2989,7 +2971,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                bcc_ptr++;              /* skip password */
                /* already aligned so no need to do it below */
        } else {
-                pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
                /* BB FIXME add code to fail this if NTLMv2 or Kerberos
                   specified as required (when that support is added to
                   the vfs in the future) as only NTLM or the much
@@ -3005,9 +2987,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                                         bcc_ptr);
                else
 #endif /* CIFS_WEAK_PW_HASH */
-                SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
+                rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
+                                        bcc_ptr);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
                        /* must align unicode strings */
                        *bcc_ptr = 0; /* null byte password */
@@ -3045,7 +3028,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->ByteCount = cpu_to_le16(count);
        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
-                         CIFS_STD_OP);
+                         0);
        /* above now done in SendReceive */
        if ((rc == 0) && (tcon != NULL)) {
@@ -3055,7 +3038,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                tcon->need_reconnect = false;
                tcon->tid = smb_buffer_response->Tid;
                bcc_ptr = pByteArea(smb_buffer_response);
-                bytes_left = BCC(smb_buffer_response);
+                bytes_left = get_bcc(smb_buffer_response);
                length = strnlen(bcc_ptr, bytes_left - 2);
                if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
                        is_unicode = true;
@@ -3107,32 +3090,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
-        int i, ret;
+        struct rb_root *root = &cifs_sb->tlink_tree;
+        struct rb_node *node;
+        struct tcon_link *tlink;
        char *tmp;
-        struct tcon_link *tlink[8];
-        unsigned long index = 0;
        cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
-        do {
+        spin_lock(&cifs_sb->tlink_tree_lock);
-                spin_lock(&cifs_sb->tlink_tree_lock);
+        while ((node = rb_first(root))) {
-                ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+                tlink = rb_entry(node, struct tcon_link, tl_rbnode);
-                                             (void **)tlink, index,
+                cifs_get_tlink(tlink);
-                                             ARRAY_SIZE(tlink));
+                clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
-                /* increment index for next pass */
+                rb_erase(node, root);
-                if (ret > 0)
-                        index = tlink[ret - 1]->tl_index + 1;
-                for (i = 0; i < ret; i++) {
-                        cifs_get_tlink(tlink[i]);
-                        clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
-                        radix_tree_delete(&cifs_sb->tlink_tree,
-                                                        tlink[i]->tl_index);
-                }
-                spin_unlock(&cifs_sb->tlink_tree_lock);
-                for (i = 0; i < ret; i++)
+                spin_unlock(&cifs_sb->tlink_tree_lock);
-                        cifs_put_tlink(tlink[i]);
+                cifs_put_tlink(tlink);
-        } while (ret != 0);
+                spin_lock(&cifs_sb->tlink_tree_lock);
+        }
+        spin_unlock(&cifs_sb->tlink_tree_lock);
        tmp = cifs_sb->prepath;
        cifs_sb->prepathlen = 0;
@@ -3271,22 +3247,10 @@ out:
        return tcon;
 }
-static struct tcon_link *
+static inline struct tcon_link *
 cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
 {
-        struct tcon_link *tlink;
+        return cifs_sb->master_tlink;        /* no tree lookup or lock */
-        unsigned int ret;
-        spin_lock(&cifs_sb->tlink_tree_lock);
-        ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
-                                        0, 1, CIFS_TLINK_MASTER_TAG);
-        spin_unlock(&cifs_sb->tlink_tree_lock);
-        /* the master tcon should always be present */
-        if (ret == 0)
-                BUG();
-        return tlink;
 }
 struct cifsTconInfo *
@@ -3302,6 +3266,47 @@ cifs_sb_tcon_pending_wait(void *unused)
        return signal_pending(current) ? -ERESTARTSYS : 0;
 }
+/*
+ * Find and return the tlink whose tl_uid equals @uid.
+ *
+ * Walks down from the root of @root, going left when the current node's
+ * tl_uid is greater than @uid and right when it is smaller. Returns NULL
+ * when no node matches. No reference is taken and no lock is acquired
+ * here; the callers visible in this patch hold tlink_tree_lock around the
+ * call and do cifs_get_tlink() on the result themselves.
+ */
+static struct tcon_link *
+tlink_rb_search(struct rb_root *root, uid_t uid)
+{
+        struct rb_node *node = root->rb_node;
+        struct tcon_link *tlink;
+        while (node) {
+                tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+                if (tlink->tl_uid > uid)
+                        node = node->rb_left;
+                else if (tlink->tl_uid < uid)
+                        node = node->rb_right;
+                else
+                        return tlink;
+        }
+        return NULL;
+}
+/*
+ * Insert @new_tlink into the tree rooted at @root, keyed by tl_uid.
+ *
+ * Descends left when the existing node's tl_uid is greater than the new
+ * one and right otherwise, so an entry with an equal tl_uid ends up in the
+ * right subtree instead of being rejected. There is no duplicate check and
+ * no locking in here; the callers visible in this patch hold
+ * tlink_tree_lock around the call.
+ */
+static void
+tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
+{
+        struct rb_node **new = &(root->rb_node), *parent = NULL;
+        struct tcon_link *tlink;
+        while (*new) {
+                tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
+                parent = *new;
+                if (tlink->tl_uid > new_tlink->tl_uid)
+                        new = &((*new)->rb_left);
+                else
+                        new = &((*new)->rb_right);
+        }
+        rb_link_node(&new_tlink->tl_rbnode, parent, new); /* empty slot found */
+        rb_insert_color(&new_tlink->tl_rbnode, root);
+}
 /*
 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
 * current task.
@@ -3309,7 +3314,7 @@ cifs_sb_tcon_pending_wait(void *unused)
 * If the superblock doesn't refer to a multiuser mount, then just return
 * the master tcon for the mount.
 *
- * First, search the radix tree for an existing tcon for this fsuid. If one
+ * First, search the rbtree for an existing tcon for this fsuid. If one
 * exists, then check to see if it's pending construction. If it is then wait
 * for construction to complete. Once it's no longer pending, check to see if
 * it failed and either return an error or retry construction, depending on
@@ -3322,14 +3327,14 @@ struct tcon_link *
 cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
 {
        int ret;
-        unsigned long fsuid = (unsigned long) current_fsuid();
+        uid_t fsuid = current_fsuid();
        struct tcon_link *tlink, *newtlink;
        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
                return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
        spin_lock(&cifs_sb->tlink_tree_lock);
-        tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+        tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
        if (tlink)
                cifs_get_tlink(tlink);
        spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3338,36 +3343,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
                newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
                if (newtlink == NULL)
                        return ERR_PTR(-ENOMEM);
-                newtlink->tl_index = fsuid;
+                newtlink->tl_uid = fsuid;
                newtlink->tl_tcon = ERR_PTR(-EACCES);
                set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
                set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
                cifs_get_tlink(newtlink);
-                ret = radix_tree_preload(GFP_KERNEL);
-                if (ret != 0) {
-                        kfree(newtlink);
-                        return ERR_PTR(ret);
-                }
                spin_lock(&cifs_sb->tlink_tree_lock);
                /* was one inserted after previous search? */
-                tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+                tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
                if (tlink) {
                        cifs_get_tlink(tlink);
                        spin_unlock(&cifs_sb->tlink_tree_lock);
-                        radix_tree_preload_end();
                        kfree(newtlink);
                        goto wait_for_construction;
                }
-                ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
-                spin_unlock(&cifs_sb->tlink_tree_lock);
-                radix_tree_preload_end();
-                if (ret) {
-                        kfree(newtlink);
-                        return ERR_PTR(ret);
-                }
                tlink = newtlink;
+                tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+                spin_unlock(&cifs_sb->tlink_tree_lock);
        } else {
 wait_for_construction:
                ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3413,39 +3406,39 @@ cifs_prune_tlinks(struct work_struct *work)
 {
        struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
                                                    prune_tlinks.work);
-        struct tcon_link *tlink[8];
+        struct rb_root *root = &cifs_sb->tlink_tree;
-        unsigned long now = jiffies;
+        struct rb_node *node = rb_first(root);
-        unsigned long index = 0;
+        struct rb_node *tmp;
-        int i, ret;
+        struct tcon_link *tlink;
-        do {
+        /*
-                spin_lock(&cifs_sb->tlink_tree_lock);
+         * Because we drop the spinlock in the loop in order to put the tlink
-                ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+         * it's not guarded against removal of links from the tree. The only
-                                             (void **)tlink, index,
+         * places that remove entries from the tree are this function and
-                                             ARRAY_SIZE(tlink));
+         * umounts. Because this function is non-reentrant and is canceled
-                /* increment index for next pass */
+         * before umount can proceed, this is safe.
-                if (ret > 0)
+         */
-                        index = tlink[ret - 1]->tl_index + 1;
+        spin_lock(&cifs_sb->tlink_tree_lock);
-                for (i = 0; i < ret; i++) {
+        node = rb_first(root);
-                        if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
+        while (node != NULL) {
-                            atomic_read(&tlink[i]->tl_count) != 0 ||
+                tmp = node;
-                            time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
+                node = rb_next(tmp);
-                                       now)) {
+                tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
-                                tlink[i] = NULL;
-                                continue;
+                if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
-                        }
+                    atomic_read(&tlink->tl_count) != 0 ||
-                        cifs_get_tlink(tlink[i]);
+                    time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
-                        clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
+                        continue;
-                        radix_tree_delete(&cifs_sb->tlink_tree,
-                                          tlink[i]->tl_index);
-                }
-                spin_unlock(&cifs_sb->tlink_tree_lock);
-                for (i = 0; i < ret; i++) {
+                cifs_get_tlink(tlink);
-                        if (tlink[i] != NULL)
+                clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
-                                cifs_put_tlink(tlink[i]);
+                rb_erase(tmp, root);
-                }
-        } while (ret != 0);
+                spin_unlock(&cifs_sb->tlink_tree_lock);
+                cifs_put_tlink(tlink);
+                spin_lock(&cifs_sb->tlink_tree_lock);
+        }
+        spin_unlock(&cifs_sb->tlink_tree_lock);
        queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
                                TLINK_IDLE_EXPIRE);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3840eddbfb7..dd5f22918c3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,17 +130,6 @@ cifs_bp_rename_retry:
        return full_path;
 }
-static void setup_cifs_dentry(struct cifsTconInfo *tcon,
-                              struct dentry *direntry,
-                              struct inode *newinode)
-{
-        if (tcon->nocase)
-                direntry->d_op = &cifs_ci_dentry_ops;
-        else
-                direntry->d_op = &cifs_dentry_ops;
-        d_instantiate(direntry, newinode);
-}
 /* Inode operations in similar order to how they appear in Linux file fs.h */
 int
@@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        args.uid = NO_CHANGE_64;
                        args.gid = NO_CHANGE_64;
                }
-                CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+                CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
-                                        cifs_sb->local_nls,
+                                        current->tgid);
-                                        cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        } else {
                /* BB implement mode setting via Windows security
                   descriptors e.g. */
@@ -329,7 +316,7 @@ cifs_create_get_file_info:
 cifs_create_set_dentry:
        if (rc == 0)
-                setup_cifs_dentry(tcon, direntry, newinode);
+                d_instantiate(direntry, newinode);
        else
                cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
@@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                rc = cifs_get_inode_info_unix(&newinode, full_path,
                                                inode->i_sb, xid);
-                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
-                else
-                        direntry->d_op = &cifs_dentry_ops;
                if (rc == 0)
                        d_instantiate(direntry, newinode);
@@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                                parent_dir_inode->i_sb, xid, NULL);
        if ((rc == 0) && (newInode != NULL)) {
-                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
-                else
-                        direntry->d_op = &cifs_dentry_ops;
                d_add(direntry, newInode);
                if (posix_open) {
                        filp = lookup_instantiate_filp(nd, direntry,
@@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        } else if (rc == -ENOENT) {
                rc = 0;
                direntry->d_time = jiffies;
-                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
-                else
-                        direntry->d_op = &cifs_dentry_ops;
                d_add(direntry, NULL);
        /*      if it was once a directory (but how can we tell?) we could do
                shrink_dcache_parent(direntry); */
@@ -656,22 +631,37 @@ lookup_out:
 static int
 cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
-        int isValid = 1;
+        /*
+         * Decide whether a cached dentry may still be used. Returns 1 if
+         * it is still valid, 0 if it has to be looked up again, and
+         * -ECHILD when LOOKUP_RCU is set in nd->flags.
+         *
+         * nd may be NULL (the !nd check further down expects that), so it
+         * has to be tested before its flags are read.
+         */
+        if (nd && (nd->flags & LOOKUP_RCU))
+                return -ECHILD;
         if (direntry->d_inode) {
                 if (cifs_revalidate_dentry(direntry))
                         return 0;
-        } else {
+                else
-                cFYI(1, "neg dentry 0x%p name = %s",
+                        return 1;
-                         direntry, direntry->d_name.name);
-                if (time_after(jiffies, direntry->d_time + HZ) ||
-                        !lookupCacheEnabled) {
-                        d_drop(direntry);
-                        isValid = 0;
-                }
         }
-        return isValid;
+        /*
+         * This may be nfsd (or something), anyway, we can't see the
+         * intent of this. So, since this can be for creation, drop it.
+         */
+        if (!nd)
+                return 0;
+        /*
+         * Drop the negative dentry, in order to make sure to use the
+         * case sensitive name which is specified by user if this is
+         * for creation.
+         */
+        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+                if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+                        return 0;
+        }
+        if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
+                return 0;
+        return 1;
 }
 /* static int cifs_d_delete(struct dentry *direntry)
@@ -685,12 +675,14 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 const struct dentry_operations cifs_dentry_ops = {
         .d_revalidate = cifs_d_revalidate,
+        .d_automount = cifs_dfs_d_automount, /* as in cifs_ci_dentry_ops */
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
-static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *q)
 {
-        struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+        struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
        unsigned long hash;
        int i;
@@ -703,21 +695,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
        return 0;
 }
-static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
+static int cifs_ci_compare(const struct dentry *parent,
-                           struct qstr *b)
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+        struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+        /*
+         * The names match (return 0) only when the lengths are equal and
+         * nls_strnicmp() reports no difference using the local_nls table
+         * of the parent inode's superblock; anything else returns 1.
+         * Unlike the code removed here, neither name is written to.
+         */
-        if ((a->len == b->len) &&
+        if ((name->len == len) &&
-            (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) {
+            (nls_strnicmp(codepage, name->name, str, len) == 0))
-                /*
-                 * To preserve case, don't let an existing negative dentry's
-                 * case take precedence.  If a is not a negative dentry, this
-                 * should have no side effects
-                 */
-                memcpy((void *)a->name, b->name, a->len);
                 return 0;
-        }
         return 1;
 }
@@ -725,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
        .d_revalidate = cifs_d_revalidate,
        .d_hash = cifs_ci_hash,
        .d_compare = cifs_ci_compare,
+        .d_automount = cifs_dfs_d_automount,
 };
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad..548f06230a6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
        /* Search for server name delimiter */
        sep = memchr(hostname, '\\', len);
        if (sep)
-                len = sep - unc;
+                len = sep - hostname;
        else
                cFYI(1, "%s: probably server name is whole unc: %s",
                     __func__, unc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7f..0de17c1db60 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,58 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
                return FILE_OPEN;
 }
-static inline int cifs_open_inode_helper(struct inode *inode,
-        struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
-        char *full_path, int xid)
-{
-        struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-        struct timespec temp;
-        int rc;
-        if (pCifsInode->clientCanCacheRead) {
-                /* we have the inode open somewhere else
-                   no need to discard cache data */
-                goto client_can_cache;
-        }
-        /* BB need same check in cifs_create too? */
-        /* if not oplocked, invalidate inode pages if mtime or file
-           size changed */
-        temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-        if (timespec_equal(&inode->i_mtime, &temp) &&
-                           (inode->i_size ==
-                            (loff_t)le64_to_cpu(buf->EndOfFile))) {
-                cFYI(1, "inode unchanged on server");
-        } else {
-                if (inode->i_mapping) {
-                        /* BB no need to lock inode until after invalidate
-                        since namei code should already have it locked? */
-                        rc = filemap_write_and_wait(inode->i_mapping);
-                        mapping_set_error(inode->i_mapping, rc);
-                }
-                cFYI(1, "invalidating remote inode since open detected it "
-                         "changed");
-                invalidate_remote_inode(inode);
-        }
-client_can_cache:
-        if (pTcon->unix_ext)
-                rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
-                                              xid);
-        else
-                rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-                                         xid, NULL);
-        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                pCifsInode->clientCanCacheAll = true;
-                pCifsInode->clientCanCacheRead = true;
-                cFYI(1, "Exclusive Oplock granted on inode %p", inode);
-        } else if ((oplock & 0xF) == OPLOCK_READ)
-                pCifsInode->clientCanCacheRead = true;
-        return rc;
-}
 int cifs_posix_open(char *full_path, struct inode **pinode,
                        struct super_block *sb, int mode, unsigned int f_flags,
                        __u32 *poplock, __u16 *pnetfid, int xid)
@@ -218,6 +166,76 @@ posix_open_ret:
        return rc;
 }
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+             struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+             __u16 *pnetfid, int xid)
+{
+        int rc;
+        int desiredAccess;
+        int disposition;
+        FILE_ALL_INFO *buf;
+        desiredAccess = cifs_convert_flags(f_flags);
+/*********************************************************************
+ *  open flag mapping table:
+ *
+ *      POSIX Flag            CIFS Disposition
+ *      ----------            ----------------
+ *      O_CREAT               FILE_OPEN_IF
+ *      O_CREAT | O_EXCL      FILE_CREATE
+ *      O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *      O_TRUNC               FILE_OVERWRITE
+ *      none of the above     FILE_OPEN
+ *
+ *      Note that there is not a direct match between disposition
+ *      FILE_SUPERSEDE (ie create whether or not file exists although
+ *      O_CREAT | O_TRUNC is similar but truncates the existing
+ *      file rather than creating a new file as FILE_SUPERSEDE does
+ *      (which uses the attributes / metadata passed in on open call)
+ *?
+ *?  O_SYNC is a reasonable match to CIFS writethrough flag
+ *?  and the read write flags match reasonably.  O_LARGEFILE
+ *?  is irrelevant because largefile support is always used
+ *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *       O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+        disposition = cifs_get_disposition(f_flags);
+        /* BB pass O_SYNC flag through on file attributes .. BB */
+        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        if (tcon->ses->capabilities & CAP_NT_SMBS)
+                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
+                         desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+                                 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        else
+                rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
+                        desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+                                & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc)
+                goto out;
+        if (tcon->unix_ext)
+                rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+                                              xid);
+        else
+                rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+                                         xid, pnetfid);
+out:
+        kfree(buf);
+        return rc;
+}
 struct cifsFileInfo *
 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
                  struct tcon_link *tlink, __u32 oplock)
@@ -253,12 +271,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
                list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
        spin_unlock(&cifs_file_list_lock);
-        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+        cifs_set_oplock_level(pCifsInode, oplock);
-                pCifsInode->clientCanCacheAll = true;
-                pCifsInode->clientCanCacheRead = true;
-                cFYI(1, "Exclusive Oplock inode %p", inode);
-        } else if ((oplock & 0xF) == OPLOCK_READ)
-                pCifsInode->clientCanCacheRead = true;
        file->private_data = pCifsFile;
        return pCifsFile;
@@ -271,8 +284,10 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
+        struct inode *inode = cifs_file->dentry->d_inode;
        struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
-        struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
+        struct cifsInodeInfo *cifsi = CIFS_I(inode);
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsLockInfo *li, *tmp;
        spin_lock(&cifs_file_list_lock);
@@ -288,8 +303,14 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
        if (list_empty(&cifsi->openFileList)) {
                cFYI(1, "closing last open instance for inode %p",
                        cifs_file->dentry->d_inode);
-                cifsi->clientCanCacheRead = false;
-                cifsi->clientCanCacheAll  = false;
+                /* in strict cache mode we need invalidate mapping on the last
+                   close  because it may cause a error when we open this file
+                   again and get at least level II oplock */
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+                        CIFS_I(inode)->invalid_mapping = true;
+                cifs_set_oplock_level(cifsi, 0);
        }
        spin_unlock(&cifs_file_list_lock);
@@ -327,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file)
        struct cifsFileInfo *pCifsFile = NULL;
        struct cifsInodeInfo *pCifsInode;
        char *full_path = NULL;
-        int desiredAccess;
+        bool posix_open_ok = false;
-        int disposition;
        __u16 netfid;
-        FILE_ALL_INFO *buf = NULL;
        xid = GetXid();
@@ -368,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
                                file->f_flags, &oplock, &netfid, xid);
                if (rc == 0) {
                        cFYI(1, "posix open succeeded");
+                        posix_open_ok = true;
-                        pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-                                                      oplock);
-                        if (pCifsFile == NULL) {
-                                CIFSSMBClose(xid, tcon, netfid);
-                                rc = -ENOMEM;
-                        }
-                        cifs_fscache_set_inode_cookie(inode, file);
-                        goto out;
                } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
                        if (tcon->ses->serverNOS)
                                cERROR(1, "server %s of type %s returned"
@@ -395,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file)
                   or DFS errors */
        }
-        desiredAccess = cifs_convert_flags(file->f_flags);
+        if (!posix_open_ok) {
+                rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
-/*********************************************************************
+                                  file->f_flags, &oplock, &netfid, xid);
- *  open flag mapping table:
+                if (rc)
- *
+                        goto out;
- *      POSIX Flag            CIFS Disposition
- *      ----------            ----------------
- *      O_CREAT               FILE_OPEN_IF
- *      O_CREAT | O_EXCL      FILE_CREATE
- *      O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
- *      O_TRUNC               FILE_OVERWRITE
- *      none of the above     FILE_OPEN
- *
- *      Note that there is not a direct match between disposition
- *      FILE_SUPERSEDE (ie create whether or not file exists although
- *      O_CREAT | O_TRUNC is similar but truncates the existing
- *      file rather than creating a new file as FILE_SUPERSEDE does
- *      (which uses the attributes / metadata passed in on open call)
- *?
- *?  O_SYNC is a reasonable match to CIFS writethrough flag
- *?  and the read write flags match reasonably.  O_LARGEFILE
- *?  is irrelevant because largefile support is always used
- *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
- *       O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
- *********************************************************************/
-        disposition = cifs_get_disposition(file->f_flags);
-        /* BB pass O_SYNC flag through on file attributes .. BB */
-        /* Also refresh inode by passing in file_info buf returned by SMBOpen
-           and calling get_inode_info with returned buf (at least helps
-           non-Unix server case) */
-        /* BB we can not do this if this is the second open of a file
-           and the first handle has writebehind data, we might be
-           able to simply do a filemap_fdatawrite/filemap_fdatawait first */
-        buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-        if (!buf) {
-                rc = -ENOMEM;
-                goto out;
-        }
-        if (tcon->ses->capabilities & CAP_NT_SMBS)
-                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
-                         desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-                                 & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        else
-                rc = -EIO; /* no NT SMB support fall into legacy open below */
-        if (rc == -EIO) {
-                /* Old server, try legacy style OpenX */
-                rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
-                        desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-                                & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        }
-        if (rc) {
-                cFYI(1, "cifs_open returned 0x%x", rc);
-                goto out;
        }
-        rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
-        if (rc != 0)
-                goto out;
        pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
        if (pCifsFile == NULL) {
+                CIFSSMBClose(xid, tcon, netfid);
                rc = -ENOMEM;
                goto out;
        }
        cifs_fscache_set_inode_cookie(inode, file);
-        if (oplock & CIFS_CREATE_ACTION) {
+        if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
                /* time to set mode which we can not set earlier due to
                   problems creating new read-only files */
-                if (tcon->unix_ext) {
+                struct cifs_unix_set_info_args args = {
-                        struct cifs_unix_set_info_args args = {
+                        .mode   = inode->i_mode,
-                                .mode   = inode->i_mode,
+                        .uid    = NO_CHANGE_64,
-                                .uid    = NO_CHANGE_64,
+                        .gid    = NO_CHANGE_64,
-                                .gid    = NO_CHANGE_64,
+                        .ctime  = NO_CHANGE_64,
-                                .ctime  = NO_CHANGE_64,
+                        .atime  = NO_CHANGE_64,
-                                .atime  = NO_CHANGE_64,
+                        .mtime  = NO_CHANGE_64,
-                                .mtime  = NO_CHANGE_64,
+                        .device = 0,
-                                .device = 0,
+                };
-                        };
+                CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
-                        CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+                                        pCifsFile->pid);
-                                               cifs_sb->local_nls,
-                                               cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
-                }
        }
 out:
-        kfree(buf);
        kfree(full_path);
        FreeXid(xid);
        cifs_put_tlink(tlink);
@@ -607,8 +552,6 @@ reopen_success:
                rc = filemap_write_and_wait(inode->i_mapping);
                mapping_set_error(inode->i_mapping, rc);
-                pCifsInode->clientCanCacheAll = false;
-                pCifsInode->clientCanCacheRead = false;
                if (tcon->unix_ext)
                        rc = cifs_get_inode_info_unix(&inode,
                                full_path, inode->i_sb, xid);
@@ -622,18 +565,9 @@ reopen_success:
             invalidate the current end of file on the server
             we can not go to the server to get the new inod
             info */
-        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                pCifsInode->clientCanCacheAll = true;
+        cifs_set_oplock_level(pCifsInode, oplock);
-                pCifsInode->clientCanCacheRead = true;
-                cFYI(1, "Exclusive Oplock granted on inode %p",
-                         pCifsFile->dentry->d_inode);
-        } else if ((oplock & 0xF) == OPLOCK_READ) {
-                pCifsInode->clientCanCacheRead = true;
-                pCifsInode->clientCanCacheAll = false;
-        } else {
-                pCifsInode->clientCanCacheRead = false;
-                pCifsInode->clientCanCacheAll = false;
-        }
        cifs_relock_file(pCifsFile);
 reopen_error_exit:
@@ -775,12 +709,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
-        if (file->private_data == NULL) {
-                rc = -EBADF;
-                FreeXid(xid);
-                return rc;
-        }
        netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
        if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -806,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                /* BB we could chain these into one lock request BB */
                rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
-                                 0, 1, lockType, 0 /* wait flag */ );
+                                 0, 1, lockType, 0 /* wait flag */, 0);
                if (rc == 0) {
                        rc = CIFSSMBLock(xid, tcon, netfid, length,
                                         pfLock->fl_start, 1 /* numUnlock */ ,
                                         0 /* numLock */ , lockType,
-                                         0 /* wait flag */ );
+                                         0 /* wait flag */, 0);
                        pfLock->fl_type = F_UNLCK;
                        if (rc != 0)
                                cERROR(1, "Error unlocking previously locked "
@@ -828,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                                rc = CIFSSMBLock(xid, tcon, netfid, length,
                                        pfLock->fl_start, 0, 1,
                                        lockType | LOCKING_ANDX_SHARED_LOCK,
-                                        0 /* wait flag */);
+                                        0 /* wait flag */, 0);
                                if (rc == 0) {
                                        rc = CIFSSMBLock(xid, tcon, netfid,
                                                length, pfLock->fl_start, 1, 0,
                                                lockType |
                                                LOCKING_ANDX_SHARED_LOCK,
-                                                0 /* wait flag */);
+                                                0 /* wait flag */, 0);
                                        pfLock->fl_type = F_RDLCK;
                                        if (rc != 0)
                                                cERROR(1, "Error unlocking "
@@ -877,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                if (numLock) {
                        rc = CIFSSMBLock(xid, tcon, netfid, length,
-                                        pfLock->fl_start,
+                                         pfLock->fl_start, 0, numLock, lockType,
-                                        0, numLock, lockType, wait_flag);
+                                         wait_flag, 0);
                        if (rc == 0) {
                                /* For Windows locks we must store them. */
@@ -898,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                                                (pfLock->fl_start + length) >=
                                                (li->offset + li->length)) {
                                        stored_rc = CIFSSMBLock(xid, tcon,
-                                                        netfid,
+                                                        netfid, li->length,
-                                                        li->length, li->offset,
+                                                        li->offset, 1, 0,
-                                                        1, 0, li->type, false);
+                                                        li->type, false, 0);
                                        if (stored_rc)
                                                rc = stored_rc;
                                        else {
@@ -919,31 +847,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
        return rc;
 }
-/*
- * Set the timeout on write requests past EOF. For some servers (Windows)
- * these calls can be very long.
- *
- * If we're writing >10M past the EOF we give a 180s timeout. Anything less
- * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
- * The 10M cutoff is totally arbitrary. A better scheme for this would be
- * welcome if someone wants to suggest one.
- *
- * We may be able to do a better job with this if there were some way to
- * declare that a file should be sparse.
- */
-static int
-cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
-{
-        if (offset <= cifsi->server_eof)
-                return CIFS_STD_OP;
-        else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
-                return CIFS_VLONG_OP;
-        else
-                return CIFS_LONG_OP;
-}
 /* update the file size (if needed) after a write */
-static void
+void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
                      unsigned int bytes_written)
 {
@@ -956,14 +861,15 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        size_t write_size, loff_t *poffset)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
        int rc = 0;
        unsigned int bytes_written = 0;
        unsigned int total_written;
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *pTcon;
-        int xid, long_op;
+        int xid;
        struct cifsFileInfo *open_file;
-        struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -982,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        xid = GetXid();
-        long_op = cifs_write_timeout(cifsi, *poffset);
        for (total_written = 0; write_size > total_written;
             total_written += bytes_written) {
                rc = -EAGAIN;
@@ -1010,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                                min_t(const int, cifs_sb->wsize,
                                      write_size - total_written),
                                *poffset, &bytes_written,
-                                NULL, write_data + total_written, long_op);
+                                NULL, write_data + total_written, 0);
                }
                if (rc || (bytes_written == 0)) {
                        if (total_written)
@@ -1023,27 +928,21 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                        cifs_update_eof(cifsi, *poffset, bytes_written);
                        *poffset += bytes_written;
                }
-                long_op = CIFS_STD_OP; /* subsequent writes fast -
-                                    15 seconds is plenty */
        }
        cifs_stats_bytes_written(pTcon, total_written);
-        /* since the write may have blocked check these pointers again */
-        if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-                struct inode *inode = file->f_path.dentry->d_inode;
 /* Do not update local mtime - server will set its actual value on write
- *              inode->i_ctime = inode->i_mtime =
+ *      inode->i_ctime = inode->i_mtime =
- *                      current_fs_time(inode->i_sb);*/
+ *              current_fs_time(inode->i_sb);*/
-                if (total_written > 0) {
+        if (total_written > 0) {
-                        spin_lock(&inode->i_lock);
+                spin_lock(&inode->i_lock);
-                        if (*poffset > file->f_path.dentry->d_inode->i_size)
+                if (*poffset > inode->i_size)
-                                i_size_write(file->f_path.dentry->d_inode,
+                        i_size_write(inode, *poffset);
-                                        *poffset);
+                spin_unlock(&inode->i_lock);
-                        spin_unlock(&inode->i_lock);
-                }
-                mark_inode_dirty_sync(file->f_path.dentry->d_inode);
        }
+        mark_inode_dirty_sync(inode);
        FreeXid(xid);
        return total_written;
 }
@@ -1057,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
        unsigned int total_written;
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *pTcon;
-        int xid, long_op;
+        int xid;
        struct dentry *dentry = open_file->dentry;
        struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
@@ -1070,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
        xid = GetXid();
-        long_op = cifs_write_timeout(cifsi, *poffset);
        for (total_written = 0; write_size > total_written;
             total_written += bytes_written) {
                rc = -EAGAIN;
@@ -1100,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
                                rc = CIFSSMBWrite2(xid, pTcon,
                                                open_file->netfid, len,
                                                *poffset, &bytes_written,
-                                                iov, 1, long_op);
+                                                iov, 1, 0);
                        } else
                                rc = CIFSSMBWrite(xid, pTcon,
                                         open_file->netfid,
@@ -1108,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
                                               write_size - total_written),
                                         *poffset, &bytes_written,
                                         write_data + total_written,
-                                         NULL, long_op);
+                                         NULL, 0);
                }
                if (rc || (bytes_written == 0)) {
                        if (total_written)
@@ -1121,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
                        cifs_update_eof(cifsi, *poffset, bytes_written);
                        *poffset += bytes_written;
                }
-                long_op = CIFS_STD_OP; /* subsequent writes fast -
-                                    15 seconds is plenty */
        }
        cifs_stats_bytes_written(pTcon, total_written);
@@ -1138,7 +1034,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
        return total_written;
 }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
                                        bool fsuid_only)
 {
@@ -1172,13 +1067,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
        spin_unlock(&cifs_file_list_lock);
        return NULL;
 }
-#endif
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
                                        bool fsuid_only)
 {
        struct cifsFileInfo *open_file;
-        struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+        struct cifs_sb_info *cifs_sb;
        bool any_available = false;
        int rc;
@@ -1192,6 +1086,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
                return NULL;
        }
+        cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
        /* only filter by fsuid on multiuser mounts */
        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
                fsuid_only = false;
@@ -1322,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping,
        struct pagevec pvec;
        int rc = 0;
        int scanned = 0;
-        int xid, long_op;
+        int xid;
        cifs_sb = CIFS_SB(mapping->host->i_sb);
@@ -1460,43 +1356,67 @@ retry:
                                break;
                }
                if (n_iov) {
+retry_write:
                        open_file = find_writable_file(CIFS_I(mapping->host),
                                                        false);
                        if (!open_file) {
                                cERROR(1, "No writable handles for inode");
                                rc = -EBADF;
                        } else {
-                                long_op = cifs_write_timeout(cifsi, offset);
                                rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
                                                   bytes_to_write, offset,
                                                   &bytes_written, iov, n_iov,
-                                                   long_op);
+                                                   0);
                                cifsFileInfo_put(open_file);
-                                cifs_update_eof(cifsi, offset, bytes_written);
                        }
-                        if (rc || bytes_written < bytes_to_write) {
+                        cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
-                                cERROR(1, "Write2 ret %d, wrote %d",
-                                          rc, bytes_written);
+                        /*
-                                mapping_set_error(mapping, rc);
+                         * For now, treat a short write as if nothing got
-                        } else {
+                         * written. A zero length write however indicates
+                         * ENOSPC or EFBIG. We have no way to know which
+                         * though, so call it ENOSPC for now. EFBIG would
+                         * get translated to AS_EIO anyway.
+                         *
+                         * FIXME: make it take into account the data that did
+                         *        get written
+                         */
+                        if (rc == 0) {
+                                if (bytes_written == 0)
+                                        rc = -ENOSPC;
+                                else if (bytes_written < bytes_to_write)
+                                        rc = -EAGAIN;
+                        }
+                        /* retry on data-integrity flush */
+                        if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+                                goto retry_write;
+                        /* fix the stats and EOF */
+                        if (bytes_written > 0) {
                                cifs_stats_bytes_written(tcon, bytes_written);
+                                cifs_update_eof(cifsi, offset, bytes_written);
                        }
                        for (i = 0; i < n_iov; i++) {
                                page = pvec.pages[first + i];
-                                /* Should we also set page error on
+                                /* on retryable write error, redirty page */
-                                success rc but too little data written? */
+                                if (rc == -EAGAIN)
-                                /* BB investigate retry logic on temporary
+                                        redirty_page_for_writepage(wbc, page);
-                                server crash cases and how recovery works
+                                else if (rc != 0)
-                                when page marked as error */
-                                if (rc)
                                        SetPageError(page);
                                kunmap(page);
                                unlock_page(page);
                                end_page_writeback(page);
                                page_cache_release(page);
                        }
+                        if (rc != -EAGAIN)
+                                mapping_set_error(mapping, rc);
+                        else
+                                rc = 0;
                        if ((wbc->nr_to_write -= n_iov) <= 0)
                                done = 1;
                        index = next;
@@ -1608,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
        return rc;
 }
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
        int xid;
        int rc = 0;
        struct cifsTconInfo *tcon;
        struct cifsFileInfo *smbfile = file->private_data;
        struct inode *inode = file->f_path.dentry->d_inode;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        xid = GetXid();
        cFYI(1, "Sync file - name: %s datasync: 0x%x",
                file->f_path.dentry->d_name.name, datasync);
-        rc = filemap_write_and_wait(inode->i_mapping);
+        if (!CIFS_I(inode)->clientCanCacheRead)
-        if (rc == 0) {
+                cifs_invalidate_mapping(inode);
-                struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-                tcon = tlink_tcon(smbfile->tlink);
+        tcon = tlink_tcon(smbfile->tlink);
-                if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-                        rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+                rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-        }
+        FreeXid(xid);
+        return rc;
+}
+int cifs_fsync(struct file *file, int datasync)
+{
+        int xid;
+        int rc = 0;
+        struct cifsTconInfo *tcon;
+        struct cifsFileInfo *smbfile = file->private_data;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+        xid = GetXid();
+        cFYI(1, "Sync file - name: %s datasync: 0x%x",
+                file->f_path.dentry->d_name.name, datasync);
+        tcon = tlink_tcon(smbfile->tlink);
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+                rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
        FreeXid(xid);
        return rc;
@@ -1679,42 +1619,242 @@ int cifs_flush(struct file *file, fl_owner_t id)
        return rc;
 }
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
+/*
+ * Fill pages[0..num_pages-1] with newly allocated pages.
+ *
+ * Returns 0 on success. On allocation failure every page allocated so far
+ * is released again and -ENOMEM is returned, so the caller only has to free
+ * the pages array itself.
+ */
+static int
-        size_t read_size, loff_t *poffset)
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
 {
-        int rc = -EACCES;
+        int rc = 0;
-        unsigned int bytes_read = 0;
+        unsigned long i;
-        unsigned int total_read = 0;
-        unsigned int current_read_size;
+        for (i = 0; i < num_pages; i++) {
-        struct cifs_sb_info *cifs_sb;
+                /*
+                 * __GFP_HIGHMEM on its own is only a zone modifier; combine
+                 * it with GFP_KERNEL so the allocator is allowed to reclaim
+                 * instead of failing as soon as no free page is at hand.
+                 */
+                pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+                if (!pages[i]) {
+                        /*
+                         * save number of pages we have already allocated and
+                         * return with ENOMEM error
+                         */
+                        num_pages = i;
+                        rc = -ENOMEM;
+                        goto error;
+                }
+        }
+        return rc;
+error:
+        for (i = 0; i < num_pages; i++)
+                put_page(pages[i]);
+        return rc;
+}
+/*
+ * Work out how many pages the next write chunk needs.
+ *
+ * The chunk is the smaller of len and wsize; when cur_len is not NULL the
+ * chunk size in bytes is stored there. Returns the chunk size rounded up to
+ * whole pages of PAGE_CACHE_SIZE bytes.
+ */
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+        const size_t chunk = min_t(const size_t, len, wsize);
+        size_t pages_needed = chunk / PAGE_CACHE_SIZE;
+        /* a trailing partial page still occupies a full page */
+        if (chunk % PAGE_CACHE_SIZE != 0)
+                pages_needed += 1;
+        if (cur_len != NULL)
+                *cur_len = chunk;
+        return pages_needed;
+}
+/*
+ * Send the data described by iov/nr_segs to the server starting at *poffset,
+ * without going through the page cache.
+ *
+ * The user data is copied into freshly allocated pages and sent in chunks of
+ * at most cifs_sb->wsize bytes with CIFSSMBWrite2(). Page i of a chunk is
+ * described by to_send[i + 1]; to_send[0] is not filled in here (presumably
+ * it is reserved for the request header - confirm against CIFSSMBWrite2).
+ *
+ * *poffset is advanced past the bytes that were written.
+ *
+ * Returns the number of bytes written, or a negative errno when nothing
+ * could be written at all.
+ *
+ * NOTE(review): the iov iterator is advanced by the number of bytes copied
+ * for a chunk, while *poffset advances by the number of bytes the server
+ * reports as written; a short write looks like it would skip user data -
+ * confirm whether CIFSSMBWrite2 can return a short count here.
+ */
+static ssize_t
+cifs_iovec_write(struct file *file, const struct iovec *iov,
+                 unsigned long nr_segs, loff_t *poffset)
+{
+        size_t total_written = 0, written = 0;
+        unsigned long num_pages, npages;
+        size_t copied, len, cur_len, i;
+        struct kvec *to_send;
+        struct page **pages;
+        struct iov_iter it;
+        struct inode *inode;
+        struct cifsFileInfo *open_file;
         struct cifsTconInfo *pTcon;
+        struct cifs_sb_info *cifs_sb;
+        int xid, rc;
+        len = iov_length(iov, nr_segs);
+        if (!len)
+                return 0;
+        rc = generic_write_checks(file, poffset, &len, 0);
+        if (rc)
+                return rc;
+        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+        num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+        /* size the array by the element it holds (struct page pointers) */
+        pages = kmalloc(sizeof(*pages) * num_pages, GFP_KERNEL);
+        if (!pages)
+                return -ENOMEM;
+        to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
+        if (!to_send) {
+                kfree(pages);
+                return -ENOMEM;
+        }
+        rc = cifs_write_allocate_pages(pages, num_pages);
+        if (rc) {
+                kfree(pages);
+                kfree(to_send);
+                return rc;
+        }
+        xid = GetXid();
+        open_file = file->private_data;
+        pTcon = tlink_tcon(open_file->tlink);
+        inode = file->f_path.dentry->d_inode;
+        iov_iter_init(&it, iov, nr_segs, len, 0);
+        npages = num_pages;
+        do {
+                size_t save_len = cur_len;
+                /*
+                 * Start every chunk with a clean count: if reopening the
+                 * handle fails below, the write call is skipped and a stale
+                 * value would be accounted a second time.
+                 */
+                written = 0;
+                for (i = 0; i < npages; i++) {
+                        copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
+                        copied = iov_iter_copy_from_user(pages[i], &it, 0,
+                                                         copied);
+                        cur_len -= copied;
+                        iov_iter_advance(&it, copied);
+                        to_send[i+1].iov_base = kmap(pages[i]);
+                        to_send[i+1].iov_len = copied;
+                }
+                /* number of bytes actually copied into the pages */
+                cur_len = save_len - cur_len;
+                do {
+                        if (open_file->invalidHandle) {
+                                rc = cifs_reopen_file(open_file, false);
+                                if (rc != 0)
+                                        break;
+                        }
+                        rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+                                           cur_len, *poffset, &written,
+                                           to_send, npages, 0);
+                } while (rc == -EAGAIN);
+                for (i = 0; i < npages; i++)
+                        kunmap(pages[i]);
+                if (written) {
+                        len -= written;
+                        total_written += written;
+                        cifs_update_eof(CIFS_I(inode), *poffset, written);
+                        *poffset += written;
+                } else if (rc < 0) {
+                        /*
+                         * Keep the error in rc rather than folding it into
+                         * the unsigned byte counter; it is reported at the
+                         * end only if nothing was written.
+                         */
+                        break;
+                }
+                /* get length and number of kvecs of the next write */
+                npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+        } while (len > 0);
+        if (total_written > 0) {
+                spin_lock(&inode->i_lock);
+                if (*poffset > inode->i_size)
+                        i_size_write(inode, *poffset);
+                spin_unlock(&inode->i_lock);
+        }
+        cifs_stats_bytes_written(pTcon, total_written);
+        mark_inode_dirty_sync(inode);
+        for (i = 0; i < num_pages; i++)
+                put_page(pages[i]);
+        kfree(to_send);
+        kfree(pages);
+        FreeXid(xid);
+        /* partial progress wins over a later error */
+        return total_written ? (ssize_t)total_written : rc;
+}
+/*
+ * Vectored write that goes through cifs_iovec_write() instead of the page
+ * cache. When any data was written, the inode's cached mapping is flagged
+ * invalid and the kiocb position is moved to the new offset.
+ *
+ * Returns whatever cifs_iovec_write() returned.
+ */
+static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t pos)
+{
+        struct file *filp = iocb->ki_filp;
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        ssize_t nbytes;
+        /*
+         * BB - optimize the way when signing is disabled. We can drop this
+         * extra memory-to-memory copying and use iovec buffers for constructing
+         * write request.
+         */
+        nbytes = cifs_iovec_write(filp, iov, nr_segs, &pos);
+        if (nbytes <= 0)
+                return nbytes;
+        /* cached pages no longer match what was just written */
+        CIFS_I(inode)->invalid_mapping = true;
+        iocb->ki_pos = pos;
+        return nbytes;
+}
+/*
+ * aio write entry point for strict cache mode: use the generic page cache
+ * path only while clientCanCacheAll is set on the inode, otherwise write
+ * directly to the server.
+ */
+ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+                           unsigned long nr_segs, loff_t pos)
+{
+        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+        /*
+         * In strict cache mode we need to write the data to the server exactly
+         * from pos to pos+len-1 rather than flush all affected pages, because
+         * flushing may cause an error with mandatory locks that cover those
+         * pages but not the region from pos to pos+len-1.
+         */
+        if (!CIFS_I(inode)->clientCanCacheAll)
+                return cifs_user_writev(iocb, iov, nr_segs, pos);
+        return generic_file_aio_write(iocb, iov, nr_segs, pos);
+}
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+                 unsigned long nr_segs, loff_t *poffset)
+{
+        int rc;
        int xid;
+        unsigned int total_read, bytes_read = 0;
+        size_t len, cur_len;
+        int iov_offset = 0;
+        struct cifs_sb_info *cifs_sb;
+        struct cifsTconInfo *pTcon;
        struct cifsFileInfo *open_file;
-        char *smb_read_data;
-        char __user *current_offset;
        struct smb_com_read_rsp *pSMBr;
+        char *read_data;
+        if (!nr_segs)
+                return 0;
+        len = iov_length(iov, nr_segs);
+        if (!len)
+                return 0;
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        if (file->private_data == NULL) {
-                rc = -EBADF;
-                FreeXid(xid);
-                return rc;
-        }
        open_file = file->private_data;
        pTcon = tlink_tcon(open_file->tlink);
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
-        for (total_read = 0, current_offset = read_data;
+        for (total_read = 0; total_read < len; total_read += bytes_read) {
-             read_size > total_read;
+                cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
-             total_read += bytes_read, current_offset += bytes_read) {
-                current_read_size = min_t(const int, read_size - total_read,
-                                          cifs_sb->rsize);
                rc = -EAGAIN;
-                smb_read_data = NULL;
+                read_data = NULL;
                while (rc == -EAGAIN) {
                        int buf_type = CIFS_NO_BUFFER;
                        if (open_file->invalidHandle) {
@@ -1722,27 +1862,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                                if (rc != 0)
                                        break;
                        }
-                        rc = CIFSSMBRead(xid, pTcon,
+                        rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
-                                         open_file->netfid,
+                                         cur_len, *poffset, &bytes_read,
-                                         current_read_size, *poffset,
+                                         &read_data, &buf_type);
-                                         &bytes_read, &smb_read_data,
+                        pSMBr = (struct smb_com_read_rsp *)read_data;
-                                         &buf_type);
+                        if (read_data) {
-                        pSMBr = (struct smb_com_read_rsp *)smb_read_data;
+                                char *data_offset = read_data + 4 +
-                        if (smb_read_data) {
+                                                le16_to_cpu(pSMBr->DataOffset);
-                                if (copy_to_user(current_offset,
+                                if (memcpy_toiovecend(iov, data_offset,
-                                                smb_read_data +
+                                                      iov_offset, bytes_read))
-                                                4 /* RFC1001 length field */ +
-                                                le16_to_cpu(pSMBr->DataOffset),
-                                                bytes_read))
                                        rc = -EFAULT;
                                if (buf_type == CIFS_SMALL_BUFFER)
-                                        cifs_small_buf_release(smb_read_data);
+                                        cifs_small_buf_release(read_data);
                                else if (buf_type == CIFS_LARGE_BUFFER)
-                                        cifs_buf_release(smb_read_data);
+                                        cifs_buf_release(read_data);
-                                smb_read_data = NULL;
+                                read_data = NULL;
+                                iov_offset += bytes_read;
                        }
                }
                if (rc || (bytes_read == 0)) {
                        if (total_read) {
                                break;
@@ -1755,13 +1893,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                        *poffset += bytes_read;
                }
        }
        FreeXid(xid);
        return total_read;
 }
+/*
+ * Single-buffer read: wraps the user buffer in a one-element iovec and
+ * hands it to cifs_iovec_read().
+ */
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+                       size_t read_size, loff_t *poffset)
+{
+        struct iovec single = {
+                .iov_base = read_data,
+                .iov_len = read_size,
+        };
+        return cifs_iovec_read(file, &single, 1, poffset);
+}
+/*
+ * Vectored read through cifs_iovec_read(). The kiocb position is only
+ * updated when at least one byte was read.
+ */
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t pos)
+{
+        ssize_t nread = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+        if (nread <= 0)
+                return nread;
+        iocb->ki_pos = pos;
+        return nread;
+}
+/*
+ * aio read entry point for strict cache mode: read via the page cache only
+ * while clientCanCacheRead is set on the inode, otherwise read from the
+ * server.
+ */
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+                          unsigned long nr_segs, loff_t pos)
+{
+        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+        /*
+         * In strict cache mode we need to read from the server all the time
+         * if we don't have level II oplock because the server can delay mtime
+         * change - so we can't make a decision about inode invalidating.
+         * And we can also fail with pagereading if there are mandatory locks
+         * on pages affected by this read but not on the region from pos to
+         * pos+len-1.
+         */
+        if (!CIFS_I(inode)->clientCanCacheRead)
+                return cifs_user_readv(iocb, iov, nr_segs, pos);
+        return generic_file_aio_read(iocb, iov, nr_segs, pos);
+}
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
-        loff_t *poffset)
+                         loff_t *poffset)
 {
        int rc = -EACCES;
        unsigned int bytes_read = 0;
@@ -1829,6 +2011,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        return total_read;
 }
+/*
+ * mmap for strict cache mode: if clientCanCacheRead is not set on the
+ * inode, invalidate its mapping first, then fall through to
+ * generic_file_mmap(). Returns the result of generic_file_mmap().
+ */
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        int xid = GetXid();
+        int rc;
+        if (!CIFS_I(inode)->clientCanCacheRead)
+                cifs_invalidate_mapping(inode);
+        rc = generic_file_mmap(file, vma);
+        FreeXid(xid);
+        return rc;
+}
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        int rc, xid;
@@ -2275,7 +2472,8 @@ void cifs_oplock_break(struct work_struct *work)
         */
        if (!cfile->oplock_break_cancelled) {
                rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
-                                 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
+                                 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+                                 cinode->clientCanCacheRead ? 1 : 0);
                cFYI(1, "Oplock release rc = %d", rc);
        }
@@ -2299,8 +2497,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
+        struct super_block *sb = cfile->dentry->d_sb;
        cifsFileInfo_put(cfile);
-        cifs_sb_deactive(cfile->dentry->d_sb);
+        cifs_sb_deactive(sb);
 }
 const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe..297a43d0ff7 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
 *   fs/cifs/fscache.c - CIFS filesystem cache interface
 *
 *   Copyright (c) 2010 Novell, Inc.
- *   Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
 *
 *   This library is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
        if (cifsi->fscache)
                return;
-        cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+                cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
                                &cifs_fscache_inode_object_def, cifsi);
-        cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+                cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
                                cifsi->fscache);
+        }
 }
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
 {
        if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
                cifs_fscache_disable_inode_cookie(inode);
-        else {
+        else
                cifs_fscache_enable_inode_cookie(inode);
-                cFYI(1, "CIFS: fscache inode cookie set");
-        }
 }
 void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3ef..8852470b4fb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
                                inode->i_fop = &cifs_file_direct_nobrl_ops;
                        else
                                inode->i_fop = &cifs_file_direct_ops;
+                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+                                inode->i_fop = &cifs_file_strict_nobrl_ops;
+                        else
+                                inode->i_fop = &cifs_file_strict_ops;
                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
                        inode->i_fop = &cifs_file_nobrl_ops;
                else { /* not direct, send byte range locks */
                        inode->i_fop = &cifs_file_ops;
                }
                /* check if server can support readpages */
                if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
                                PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
                break;
        case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-                if (is_dfs_referral) {
+                if (IS_AUTOMOUNT(inode)) {
                        inode->i_op = &cifs_dfs_referral_inode_operations;
                } else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
        }
        spin_unlock(&inode->i_lock);
-        cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+        if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+                inode->i_flags |= S_AUTOMOUNT;
+        cifs_set_ops(inode);
 }
 void
@@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+        fattr->cf_createtime = le64_to_cpu(info->CreationTime);
        if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
                fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -686,13 +693,18 @@ int cifs_get_inode_info(struct inode **pinode,
                        cFYI(1, "cifs_sfu_type failed: %d", tmprc);
        }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
        /* fill in 0777 bits from ACL */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                cFYI(1, "Getting mode bits from ACL");
+                rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
-                cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
+                                                pfid);
+                if (rc) {
+                        cFYI(1, "%s: Getting ACL failed with error: %d",
+                                __func__, rc);
+                        goto cgii_exit;
+                }
        }
-#endif
+#endif /* CONFIG_CIFS_ACL */
        /* fill in remaining high mode bits e.g. SUID, VTX */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +735,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
        .lookup = cifs_lookup,
 };
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+                                struct cifsTconInfo *tcon)
 {
        int pplen = cifs_sb->prepathlen;
        int dfsplen;
        char *full_path = NULL;
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        /* if no prefix path, simply set path to the root of share to "" */
        if (pplen == 0) {
@@ -774,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
        if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
                return 0;
+        /* use createtime like an i_generation field */
+        if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+                return 0;
        /* don't match inode of different type */
        if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
                return 0;
@@ -791,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
        struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
        CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+        CIFS_I(inode)->createtime = fattr->cf_createtime;
        return 0;
 }
@@ -804,14 +821,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        return true;
                }
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        return false;
 }
@@ -870,7 +887,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
        char *full_path;
        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-        full_path = cifs_build_path_to_root(cifs_sb);
+        full_path = cifs_build_path_to_root(cifs_sb, tcon);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
@@ -881,8 +898,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
                                                xid, NULL);
-        if (!inode)
+        if (!inode) {
-                return ERR_PTR(rc);
+                inode = ERR_PTR(rc);
+                goto out;
+        }
 #ifdef CONFIG_CIFS_FSCACHE
        /* populate tcon->resource_id */
@@ -898,13 +917,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                inode->i_uid = cifs_sb->mnt_uid;
                inode->i_gid = cifs_sb->mnt_gid;
        } else if (rc) {
-                kfree(full_path);
-                _FreeXid(xid);
                iget_failed(inode);
-                return ERR_PTR(rc);
+                inode = ERR_PTR(rc);
        }
+out:
        kfree(full_path);
        /* can not call macro FreeXid here since in a void func
         * TODO: This is no longer true
@@ -1313,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
        to set uid/gid */
                        inc_nlink(inode);
-                        if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
-                        else
-                                direntry->d_op = &cifs_dentry_ops;
                        cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
                        cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1357,10 +1370,6 @@ mkdir_get_info:
                        rc = cifs_get_inode_info(&newinode, full_path, NULL,
                                                 inode->i_sb, xid, NULL);
-                if (pTcon->nocase)
-                        direntry->d_op = &cifs_ci_dentry_ops;
-                else
-                        direntry->d_op = &cifs_dentry_ops;
                d_instantiate(direntry, newinode);
                 /* setting nlink not necessary except in cases where we
                  * failed to get it from the server or was set bogus */
@@ -1648,6 +1657,7 @@ static bool
 cifs_inode_needs_reval(struct inode *inode)
 {
        struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        if (cifs_i->clientCanCacheRead)
                return false;
@@ -1658,20 +1668,22 @@ cifs_inode_needs_reval(struct inode *inode)
        if (cifs_i->time == 0)
                return true;
-        /* FIXME: the actimeo should be tunable */
+        if (!time_in_range(jiffies, cifs_i->time,
-        if (time_after_eq(jiffies, cifs_i->time + HZ))
+                                cifs_i->time + cifs_sb->actimeo))
                return true;
        /* hardlinked files w/ noserverino get "special" treatment */
-        if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
            S_ISREG(inode->i_mode) && inode->i_nlink != 1)
                return true;
        return false;
 }
-/* check invalid_mapping flag and zap the cache if it's set */
+/*
-static void
+ * Zap the cache. Called when invalid_mapping flag is set.
+ */
+void
 cifs_invalidate_mapping(struct inode *inode)
 {
        int rc;
@@ -2114,11 +2126,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
        if (attrs->ia_valid & ATTR_MODE) {
                rc = 0;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                        rc = mode_to_acl(inode, full_path, mode);
+                        rc = mode_to_cifs_acl(inode, full_path, mode);
-                else
+                        if (rc) {
-#endif
+                                cFYI(1, "%s: Setting ACL failed with error: %d",
+                                        __func__, rc);
+                                goto cifs_setattr_exit;
+                        }
+                } else
+#endif /* CONFIG_CIFS_ACL */
                if (((mode & S_IWUGO) == 0) &&
                    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
@@ -2177,7 +2194,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
        setattr_copy(inode, attrs);
        mark_inode_dirty(inode);
-        return 0;
 cifs_setattr_exit:
        kfree(full_path);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f34..0c98672d012 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
        struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
        struct cifsFileInfo *pSMBFile = filep->private_data;
-        struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
+        struct cifsTconInfo *tcon;
        __u64   ExtAttrBits = 0;
        __u64   ExtAttrMask = 0;
-        __u64   caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+        __u64   caps;
 #endif /* CONFIG_CIFS_POSIX */
        xid = GetXid();
@@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        break;
 #ifdef CONFIG_CIFS_POSIX
                case FS_IOC_GETFLAGS:
+                        if (pSMBFile == NULL)
+                                break;
+                        tcon = tlink_tcon(pSMBFile->tlink);
+                        caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
                        if (CIFS_UNIX_EXTATTR_CAP & caps) {
-                                if (pSMBFile == NULL)
-                                        break;
                                rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
                                        &ExtAttrBits, &ExtAttrMask);
                                if (rc == 0)
@@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        break;
                case FS_IOC_SETFLAGS:
+                        if (pSMBFile == NULL)
+                                break;
+                        tcon = tlink_tcon(pSMBFile->tlink);
+                        caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
                        if (CIFS_UNIX_EXTATTR_CAP & caps) {
                                if (get_user(ExtAttrBits, (int __user *)arg)) {
                                        rc = -EFAULT;
                                        break;
                                }
-                                if (pSMBFile == NULL)
-                                        break;
                                /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
                                        extAttrBits, &ExtAttrMask);*/
                        }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 85cdbf831e7..02cd60aefbf 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
-#include "md5.h"
 #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
 #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -47,6 +46,44 @@
        md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
 static int
+symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
+{
+        int rc;
+        unsigned int size;
+        struct crypto_shash *md5;
+        struct sdesc *sdescmd5;
+        md5 = crypto_alloc_shash("md5", 0, 0);
+        if (IS_ERR(md5)) {
+                cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
+                return PTR_ERR(md5);
+        }
+        size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
+        sdescmd5 = kmalloc(size, GFP_KERNEL);
+        if (!sdescmd5) {
+                rc = -ENOMEM;
+                cERROR(1, "%s: Memory allocation failure\n", __func__);
+                goto symlink_hash_err;
+        }
+        sdescmd5->shash.tfm = md5;
+        sdescmd5->shash.flags = 0x0;
+        rc = crypto_shash_init(&sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Could not init md5 shash\n", __func__);
+                goto symlink_hash_err;
+        }
+        crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+        rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+symlink_hash_err:
+        crypto_free_shash(md5);
+        kfree(sdescmd5);
+        return rc;
+}
+static int
 CIFSParseMFSymlink(const u8 *buf,
                   unsigned int buf_len,
                   unsigned int *_link_len,
@@ -56,7 +93,6 @@ CIFSParseMFSymlink(const u8 *buf,
        unsigned int link_len;
        const char *md5_str1;
        const char *link_str;
-        struct MD5Context md5_ctx;
        u8 md5_hash[16];
        char md5_str2[34];
@@ -70,9 +106,11 @@ CIFSParseMFSymlink(const u8 *buf,
        if (rc != 1)
                return -EINVAL;
-        cifs_MD5_init(&md5_ctx);
+        rc = symlink_hash(link_len, link_str, md5_hash);
-        cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+        if (rc) {
-        cifs_MD5_final(md5_hash, &md5_ctx);
+                cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+                return rc;
+        }
        snprintf(md5_str2, sizeof(md5_str2),
                 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +132,9 @@ CIFSParseMFSymlink(const u8 *buf,
 static int
 CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 {
+        int rc;
        unsigned int link_len;
        unsigned int ofs;
-        struct MD5Context md5_ctx;
        u8 md5_hash[16];
        if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +145,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
        if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
                return -ENAMETOOLONG;
-        cifs_MD5_init(&md5_ctx);
+        rc = symlink_hash(link_len, link_str, md5_hash);
-        cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+        if (rc) {
-        cifs_MD5_final(md5_hash, &md5_ctx);
+                cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+                return rc;
+        }
        snprintf(buf, buf_len,
                 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -524,10 +564,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                        cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
                              rc);
                } else {
-                        if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
-                        else
-                                direntry->d_op = &cifs_dentry_ops;
                        d_instantiate(direntry, newinode);
                }
        }
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d6..00000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
-   Unix SMB/Netbios implementation.
-   Version 1.9.
-   a implementation of MD4 designed for use in the SMB authentication protocol
-   Copyright (C) Andrew Tridgell 1997-1998.
-   Modified by Steve French (sfrench@us.ibm.com) 2002-2003
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include "cifsencrypt.h"
-/* NOTE: This code makes no attempt to be fast! */
-static __u32
-F(__u32 X, __u32 Y, __u32 Z)
-{
-        return (X & Y) | ((~X) & Z);
-}
-static __u32
-G(__u32 X, __u32 Y, __u32 Z)
-{
-        return (X & Y) | (X & Z) | (Y & Z);
-}
-static __u32
-H(__u32 X, __u32 Y, __u32 Z)
-{
-        return X ^ Y ^ Z;
-}
-static __u32
-lshift(__u32 x, int s)
-{
-        x &= 0xFFFFFFFF;
-        return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
-}
-#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
-#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
-#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
-/* this applies md4 to 64 byte chunks */
-static void
-mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
-{
-        int j;
-        __u32 AA, BB, CC, DD;
-        __u32 X[16];
-        for (j = 0; j < 16; j++)
-                X[j] = M[j];
-        AA = *A;
-        BB = *B;
-        CC = *C;
-        DD = *D;
-        ROUND1(A, B, C, D, 0, 3);
-        ROUND1(D, A, B, C, 1, 7);
-        ROUND1(C, D, A, B, 2, 11);
-        ROUND1(B, C, D, A, 3, 19);
-        ROUND1(A, B, C, D, 4, 3);
-        ROUND1(D, A, B, C, 5, 7);
-        ROUND1(C, D, A, B, 6, 11);
-        ROUND1(B, C, D, A, 7, 19);
-        ROUND1(A, B, C, D, 8, 3);
-        ROUND1(D, A, B, C, 9, 7);
-        ROUND1(C, D, A, B, 10, 11);
-        ROUND1(B, C, D, A, 11, 19);
-        ROUND1(A, B, C, D, 12, 3);
-        ROUND1(D, A, B, C, 13, 7);
-        ROUND1(C, D, A, B, 14, 11);
-        ROUND1(B, C, D, A, 15, 19);
-        ROUND2(A, B, C, D, 0, 3);
-        ROUND2(D, A, B, C, 4, 5);
-        ROUND2(C, D, A, B, 8, 9);
-        ROUND2(B, C, D, A, 12, 13);
-        ROUND2(A, B, C, D, 1, 3);
-        ROUND2(D, A, B, C, 5, 5);
-        ROUND2(C, D, A, B, 9, 9);
-        ROUND2(B, C, D, A, 13, 13);
-        ROUND2(A, B, C, D, 2, 3);
-        ROUND2(D, A, B, C, 6, 5);
-        ROUND2(C, D, A, B, 10, 9);
-        ROUND2(B, C, D, A, 14, 13);
-        ROUND2(A, B, C, D, 3, 3);
-        ROUND2(D, A, B, C, 7, 5);
-        ROUND2(C, D, A, B, 11, 9);
-        ROUND2(B, C, D, A, 15, 13);
-        ROUND3(A, B, C, D, 0, 3);
-        ROUND3(D, A, B, C, 8, 9);
-        ROUND3(C, D, A, B, 4, 11);
-        ROUND3(B, C, D, A, 12, 15);
-        ROUND3(A, B, C, D, 2, 3);
-        ROUND3(D, A, B, C, 10, 9);
-        ROUND3(C, D, A, B, 6, 11);
-        ROUND3(B, C, D, A, 14, 15);
-        ROUND3(A, B, C, D, 1, 3);
-        ROUND3(D, A, B, C, 9, 9);
-        ROUND3(C, D, A, B, 5, 11);
-        ROUND3(B, C, D, A, 13, 15);
-        ROUND3(A, B, C, D, 3, 3);
-        ROUND3(D, A, B, C, 11, 9);
-        ROUND3(C, D, A, B, 7, 11);
-        ROUND3(B, C, D, A, 15, 15);
-        *A += AA;
-        *B += BB;
-        *C += CC;
-        *D += DD;
-        *A &= 0xFFFFFFFF;
-        *B &= 0xFFFFFFFF;
-        *C &= 0xFFFFFFFF;
-        *D &= 0xFFFFFFFF;
-        for (j = 0; j < 16; j++)
-                X[j] = 0;
-}
-static void
-copy64(__u32 *M, unsigned char *in)
-{
-        int i;
-        for (i = 0; i < 16; i++)
-                M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
-                    (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
-}
-static void
-copy4(unsigned char *out, __u32 x)
-{
-        out[0] = x & 0xFF;
-        out[1] = (x >> 8) & 0xFF;
-        out[2] = (x >> 16) & 0xFF;
-        out[3] = (x >> 24) & 0xFF;
-}
-/* produce a md4 message digest from data of length n bytes */
-void
-mdfour(unsigned char *out, unsigned char *in, int n)
-{
-        unsigned char buf[128];
-        __u32 M[16];
-        __u32 b = n * 8;
-        int i;
-        __u32 A = 0x67452301;
-        __u32 B = 0xefcdab89;
-        __u32 C = 0x98badcfe;
-        __u32 D = 0x10325476;
-        while (n > 64) {
-                copy64(M, in);
-                mdfour64(M, &A, &B, &C, &D);
-                in += 64;
-                n -= 64;
-        }
-        for (i = 0; i < 128; i++)
-                buf[i] = 0;
-        memcpy(buf, in, n);
-        buf[n] = 0x80;
-        if (n <= 55) {
-                copy4(buf + 56, b);
-                copy64(M, buf);
-                mdfour64(M, &A, &B, &C, &D);
-        } else {
-                copy4(buf + 120, b);
-                copy64(M, buf);
-                mdfour64(M, &A, &B, &C, &D);
-                copy64(M, buf + 64);
-                mdfour64(M, &A, &B, &C, &D);
-        }
-        for (i = 0; i < 128; i++)
-                buf[i] = 0;
-        copy64(M, buf);
-        copy4(out, A);
-        copy4(out + 4, B);
-        copy4(out + 8, C);
-        copy4(out + 12, D);
-        A = B = C = D = 0;
-}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c31..00000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest.  This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
- * needed on buffers full of bytes, and then call cifs_MD5_final, which
- * will fill a supplied 16-byte array with the digest.
- */
-/* This code slightly modified to fit into Samba by
-   abartlet@samba.org Jun 2001
-   and to fit the cifs vfs by
-   Steve French sfrench@us.ibm.com */
-#include <linux/string.h>
-#include "md5.h"
-static void MD5Transform(__u32 buf[4], __u32 const in[16]);
-/*
- * Note: this code is harmless on little-endian machines.
- */
-static void
-byteReverse(unsigned char *buf, unsigned longs)
-{
-        __u32 t;
-        do {
-                t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
-                    ((unsigned) buf[1] << 8 | buf[0]);
-                *(__u32 *) buf = t;
-                buf += 4;
-        } while (--longs);
-}
-/*
- * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void
-cifs_MD5_init(struct MD5Context *ctx)
-{
-        ctx->buf[0] = 0x67452301;
-        ctx->buf[1] = 0xefcdab89;
-        ctx->buf[2] = 0x98badcfe;
-        ctx->buf[3] = 0x10325476;
-        ctx->bits[0] = 0;
-        ctx->bits[1] = 0;
-}
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void
-cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
-{
-        register __u32 t;
-        /* Update bitcount */
-        t = ctx->bits[0];
-        if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
-                ctx->bits[1]++; /* Carry from low to high */
-        ctx->bits[1] += len >> 29;
-        t = (t >> 3) & 0x3f;    /* Bytes already in shsInfo->data */
-        /* Handle any leading odd-sized chunks */
-        if (t) {
-                unsigned char *p = (unsigned char *) ctx->in + t;
-                t = 64 - t;
-                if (len < t) {
-                        memmove(p, buf, len);
-                        return;
-                }
-                memmove(p, buf, t);
-                byteReverse(ctx->in, 16);
-                MD5Transform(ctx->buf, (__u32 *) ctx->in);
-                buf += t;
-                len -= t;
-        }
-        /* Process data in 64-byte chunks */
-        while (len >= 64) {
-                memmove(ctx->in, buf, 64);
-                byteReverse(ctx->in, 16);
-                MD5Transform(ctx->buf, (__u32 *) ctx->in);
-                buf += 64;
-                len -= 64;
-        }
-        /* Handle any remaining bytes of data. */
-        memmove(ctx->in, buf, len);
-}
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void
-cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
-{
-        unsigned int count;
-        unsigned char *p;
-        /* Compute number of bytes mod 64 */
-        count = (ctx->bits[0] >> 3) & 0x3F;
-        /* Set the first char of padding to 0x80.  This is safe since there is
-           always at least one byte free */
-        p = ctx->in + count;
-        *p++ = 0x80;
-        /* Bytes of padding needed to make 64 bytes */
-        count = 64 - 1 - count;
-        /* Pad out to 56 mod 64 */
-        if (count < 8) {
-                /* Two lots of padding:  Pad the first block to 64 bytes */
-                memset(p, 0, count);
-                byteReverse(ctx->in, 16);
-                MD5Transform(ctx->buf, (__u32 *) ctx->in);
-                /* Now fill the next block with 56 bytes */
-                memset(ctx->in, 0, 56);
-        } else {
-                /* Pad block to 56 bytes */
-                memset(p, 0, count - 8);
-        }
-        byteReverse(ctx->in, 14);
-        /* Append length in bits and transform */
-        ((__u32 *) ctx->in)[14] = ctx->bits[0];
-        ((__u32 *) ctx->in)[15] = ctx->bits[1];
-        MD5Transform(ctx->buf, (__u32 *) ctx->in);
-        byteReverse((unsigned char *) ctx->buf, 4);
-        memmove(digest, ctx->buf, 16);
-        memset(ctx, 0, sizeof(*ctx));   /* In case it's sensitive */
-}
-/* The four core functions - F1 is optimized somewhat */
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
-        (w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x)
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
- * the data and converts bytes into longwords for this routine.
- */
-static void
-MD5Transform(__u32 buf[4], __u32 const in[16])
-{
-        register __u32 a, b, c, d;
-        a = buf[0];
-        b = buf[1];
-        c = buf[2];
-        d = buf[3];
-        MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
-        MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
-        MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
-        MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
-        MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
-        MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
-        MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
-        MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
-        MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
-        MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
-        MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
-        MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
-        MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
-        MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
-        MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
-        MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-        MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
-        MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
-        MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
-        MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
-        MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
-        MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
-        MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
-        MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
-        MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
-        MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
-        MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
-        MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
-        MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
-        MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
-        MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
-        MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-        MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
-        MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
-        MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
-        MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
-        MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
-        MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
-        MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
-        MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
-        MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
-        MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
-        MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
-        MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
-        MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
-        MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
-        MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
-        MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-        MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
-        MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
-        MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
-        MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
-        MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
-        MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
-        MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
-        MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
-        MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
-        MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
-        MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
-        MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
-        MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
-        MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
-        MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
-        MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-        buf[0] += a;
-        buf[1] += b;
-        buf[2] += c;
-        buf[3] += d;
-}
-#if 0   /* currently unused */
-/***********************************************************************
- the rfc 2104 version of hmac_md5 initialisation.
-***********************************************************************/
-static void
-hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-                      struct HMACMD5Context *ctx)
-{
-        int i;
-        /* if key is longer than 64 bytes reset it to key=MD5(key) */
-        if (key_len > 64) {
-                unsigned char tk[16];
-                struct MD5Context tctx;
-                cifs_MD5_init(&tctx);
-                cifs_MD5_update(&tctx, key, key_len);
-                cifs_MD5_final(tk, &tctx);
-                key = tk;
-                key_len = 16;
-        }
-        /* start out by storing key in pads */
-        memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-        memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-        memcpy(ctx->k_ipad, key, key_len);
-        memcpy(ctx->k_opad, key, key_len);
-        /* XOR key with ipad and opad values */
-        for (i = 0; i < 64; i++) {
-                ctx->k_ipad[i] ^= 0x36;
-                ctx->k_opad[i] ^= 0x5c;
-        }
-        cifs_MD5_init(&ctx->ctx);
-        cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-#endif
-/***********************************************************************
- the microsoft version of hmac_md5 initialisation.
-***********************************************************************/
-void
-hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-                         struct HMACMD5Context *ctx)
-{
-        int i;
-        /* if key is longer than 64 bytes truncate it */
-        if (key_len > 64)
-                key_len = 64;
-        /* start out by storing key in pads */
-        memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-        memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-        memcpy(ctx->k_ipad, key, key_len);
-        memcpy(ctx->k_opad, key, key_len);
-        /* XOR key with ipad and opad values */
-        for (i = 0; i < 64; i++) {
-                ctx->k_ipad[i] ^= 0x36;
-                ctx->k_opad[i] ^= 0x5c;
-        }
-        cifs_MD5_init(&ctx->ctx);
-        cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-/***********************************************************************
- update hmac_md5 "inner" buffer
-***********************************************************************/
-void
-hmac_md5_update(const unsigned char *text, int text_len,
-                struct HMACMD5Context *ctx)
-{
-        cifs_MD5_update(&ctx->ctx, text, text_len);     /* then text of datagram */
-}
-/***********************************************************************
- finish off hmac_md5 "inner" buffer and generate outer one.
-***********************************************************************/
-void
-hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
-{
-        struct MD5Context ctx_o;
-        cifs_MD5_final(digest, &ctx->ctx);
-        cifs_MD5_init(&ctx_o);
-        cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
-        cifs_MD5_update(&ctx_o, digest, 16);
-        cifs_MD5_final(digest, &ctx_o);
-}
-/***********************************************************
- single function to calculate an HMAC MD5 digest from data.
- use the microsoft hmacmd5 init method because the key is 16 bytes.
-************************************************************/
-#if 0 /* currently unused */
-static void
-hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-         unsigned char *digest)
-{
-        struct HMACMD5Context ctx;
-        hmac_md5_init_limK_to_64(key, 16, &ctx);
-        if (data_len != 0)
-                hmac_md5_update(data, data_len, &ctx);
-        hmac_md5_final(digest, &ctx);
-}
-#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402f..00000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef MD5_H
-#define MD5_H
-#ifndef HEADER_MD5_H
-/* Try to avoid clashes with OpenSSL */
-#define HEADER_MD5_H
-#endif
-struct MD5Context {
-        __u32 buf[4];
-        __u32 bits[2];
-        unsigned char in[64];
-};
-#endif                          /* !MD5_H */
-#ifndef _HMAC_MD5_H
-struct HMACMD5Context {
-        struct MD5Context ctx;
-        unsigned char k_ipad[65];
-        unsigned char k_opad[65];
-};
-#endif                          /* _HMAC_MD5_H */
-void cifs_MD5_init(struct MD5Context *context);
-void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
-                        unsigned len);
-void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
-/* The following definitions come from lib/hmacmd5.c  */
-/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-                        struct HMACMD5Context *ctx);*/
-void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-                        struct HMACMD5Context *ctx);
-void hmac_md5_update(const unsigned char *text, int text_len,
-                        struct HMACMD5Context *ctx);
-void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-                        unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe351..a09e077ba92 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                                cFYI(1, "file id match, oplock break");
                                pCifsInode = CIFS_I(netfile->dentry->d_inode);
-                                pCifsInode->clientCanCacheAll = false;
-                                if (pSMB->OplockLevel == 0)
-                                        pCifsInode->clientCanCacheRead = false;
+                                cifs_set_oplock_level(pCifsInode,
+                                        pSMB->OplockLevel ? OPLOCK_READ : 0);
                                /*
                                 * cifs_oplock_break_put() can't be called
                                 * from here.  Get reference after queueing
@@ -638,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
        return;
 }
-/* Convert 16 bit Unicode pathname to wire format from string in current code
-   page.  Conversion may involve remapping up the seven characters that are
-   only legal in POSIX-like OS (if they are present in the string). Path
-   names are little endian 16 bit Unicode on the wire */
-int
-cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-                 const struct nls_table *cp, int mapChars)
-{
-        int i, j, charlen;
-        int len_remaining = maxlen;
-        char src_char;
-        __u16 temp;
-        if (!mapChars)
-                return cifs_strtoUCS(target, source, PATH_MAX, cp);
-        for (i = 0, j = 0; i < maxlen; j++) {
-                src_char = source[i];
-                switch (src_char) {
-                        case 0:
-                                target[j] = 0;
-                                goto ctoUCS_out;
-                        case ':':
-                                target[j] = cpu_to_le16(UNI_COLON);
-                                break;
-                        case '*':
-                                target[j] = cpu_to_le16(UNI_ASTERIK);
-                                break;
-                        case '?':
-                                target[j] = cpu_to_le16(UNI_QUESTION);
-                                break;
-                        case '<':
-                                target[j] = cpu_to_le16(UNI_LESSTHAN);
-                                break;
-                        case '>':
-                                target[j] = cpu_to_le16(UNI_GRTRTHAN);
-                                break;
-                        case '|':
-                                target[j] = cpu_to_le16(UNI_PIPE);
-                                break;
-                        /* BB We can not handle remapping slash until
-                           all the calls to build_path_from_dentry
-                           are modified, as they use slash as separator BB */
-                        /* case '\\':
-                                target[j] = cpu_to_le16(UNI_SLASH);
-                                break;*/
-                        default:
-                                charlen = cp->char2uni(source+i,
-                                        len_remaining, &temp);
-                                /* if no match, use question mark, which
-                                at least in some cases servers as wild card */
-                                if (charlen < 1) {
-                                        target[j] = cpu_to_le16(0x003f);
-                                        charlen = 1;
-                                } else
-                                        target[j] = cpu_to_le16(temp);
-                                len_remaining -= charlen;
-                                /* character may take more than one byte in the
-                                   the source string, but will take exactly two
-                                   bytes in the target string */
-                                i += charlen;
-                                continue;
-                }
-                i++; /* move to next char in source string */
-                len_remaining--;
-        }
-ctoUCS_out:
-        return i;
-}
 void
 cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
@@ -722,3 +650,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
                           cifs_sb_master_tcon(cifs_sb)->treeName);
        }
 }
+/*
+ * Update the caching flags of a cifs inode from an oplock level.
+ *
+ * Only the low four bits of oplock are examined:
+ *   OPLOCK_EXCLUSIVE - clientCanCacheAll and clientCanCacheRead both set
+ *   OPLOCK_READ      - clientCanCacheRead set, clientCanCacheAll cleared
+ *   anything else    - both flags cleared
+ */
+void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
+{
+        /* discard everything above the low nibble before comparing */
+        oplock &= 0xF;
+        if (oplock == OPLOCK_EXCLUSIVE) {
+                cinode->clientCanCacheAll = true;
+                cinode->clientCanCacheRead = true;
+                cFYI(1, "Exclusive Oplock granted on inode %p",
+                     &cinode->vfs_inode);
+        } else if (oplock == OPLOCK_READ) {
+                cinode->clientCanCacheAll = false;
+                cinode->clientCanCacheRead = true;
+                cFYI(1, "Level II Oplock granted on inode %p",
+                    &cinode->vfs_inode);
+        } else {
+                cinode->clientCanCacheAll = false;
+                cinode->clientCanCacheRead = false;
+        }
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62..8d9189f6447 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
        }
        /* else ERRHRD class errors or junk  - return EIO */
-        cFYI(1, "Mapping smb error code %d to POSIX err %d",
+        cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
-                 smberrcode, rc);
+                 le32_to_cpu(smb->Status.CifsError), rc);
        /* generic corrective action e.g. reconnect SMB session on
         * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
        return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-                2 /* size of the bcc field */ + BCC(ptr));
+                2 /* size of the bcc field */ + get_bcc(ptr));
 }
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
        return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-                2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
+                2 /* size of the bcc field */ + get_bcc_le(ptr));
 }
 /* The following are taken from fs/ntfs/util.c */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f5..7f25cc3d225 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
        cFYI(1, "For %s", name->name);
        if (parent->d_op && parent->d_op->d_hash)
-                parent->d_op->d_hash(parent, name);
+                parent->d_op->d_hash(parent, parent->d_inode, name);
        else
                name->hash = full_name_hash(name->name, name->len);
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
                return NULL;
        }
-        if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
-                dentry->d_op = &cifs_ci_dentry_ops;
-        else
-                dentry->d_op = &cifs_dentry_ops;
        alias = d_materialise_unique(dentry, inode);
        if (alias != NULL) {
                dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
        fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
        fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+        fattr->cf_createtime = le64_to_cpu(info->CreationTime);
        fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
        fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -226,26 +222,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
        char *full_path = NULL;
        struct cifsFileInfo *cifsFile;
        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        struct tcon_link *tlink;
+        struct tcon_link *tlink = NULL;
        struct cifsTconInfo *pTcon;
-        tlink = cifs_sb_tlink(cifs_sb);
-        if (IS_ERR(tlink))
-                return PTR_ERR(tlink);
-        pTcon = tlink_tcon(tlink);
-        if (file->private_data == NULL)
-                file->private_data =
-                        kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
        if (file->private_data == NULL) {
-                rc = -ENOMEM;
+                tlink = cifs_sb_tlink(cifs_sb);
-                goto error_exit;
+                if (IS_ERR(tlink))
+                        return PTR_ERR(tlink);
+                cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+                if (cifsFile == NULL) {
+                        rc = -ENOMEM;
+                        goto error_exit;
+                }
+                file->private_data = cifsFile;
+                cifsFile->tlink = cifs_get_tlink(tlink);
+                pTcon = tlink_tcon(tlink);
+        } else {
+                cifsFile = file->private_data;
+                pTcon = tlink_tcon(cifsFile->tlink);
        }
-        cifsFile = file->private_data;
        cifsFile->invalidHandle = true;
        cifsFile->srch_inf.endOfSearch = false;
-        cifsFile->tlink = cifs_get_tlink(tlink);
        full_path = build_path_from_dentry(file->f_path.dentry);
        if (full_path == NULL) {
@@ -756,18 +755,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
        rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
                     ino, fattr.cf_dtype);
-        /*
-         * we can not return filldir errors to the caller since they are
-         * "normal" when the stat blocksize is too small - we return remapped
-         * error instead
-         *
-         * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
-         * case already. Why should we be clobbering other errors from it?
-         */
-        if (rc) {
-                cFYI(1, "filldir rc = %d", rc);
-                rc = -EOVERFLOW;
-        }
        dput(tmp_dentry);
        return rc;
 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed..1adc9625a34 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 }
 static void
-decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
                      const struct nls_table *nls_cp)
 {
        int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
        return;
 }
-static int decode_ascii_ssetup(char **pbcc_area, int bleft,
+static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
                               struct cifsSesInfo *ses,
                               const struct nls_table *nls_cp)
 {
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
        return 0;
 }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* BB Move to ntlmssp.c eventually */
 /* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
        NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
        __u32 flags;
+        memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
        memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
        sec_blob->MessageType = NtLmNegotiate;
        /* BB is NTLMV2 session security format easier to use here? */
        flags = NTLMSSP_NEGOTIATE_56 |  NTLMSSP_REQUEST_TARGET |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        if (ses->server->secMode &
                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
                                NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        }
-        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+        sec_blob->NegotiateFlags = cpu_to_le32(flags);
        sec_blob->WorkstationName.BufferOffset = 0;
        sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        flags = NTLMSSP_NEGOTIATE_56 |
                NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
        if (ses->server->secMode &
           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
        tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
-        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+        sec_blob->NegotiateFlags = cpu_to_le32(flags);
        sec_blob->LmChallengeResponse.BufferOffset =
                                cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->WorkstationName.MaximumLength = 0;
        tmp += 2;
-        if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+        if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
-                        !calc_seckey(ses)) {
+                (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+                        && !calc_seckey(ses)) {
                memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
        return rc;
 }
-static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
-                                 struct cifsSesInfo *ses)
-{
-        build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
-        pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
-        return;
-}
-#endif
 int
 CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
               const struct nls_table *nls_cp)
@@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
        char *str_area;
        SESSION_SETUP_ANDX *pSMB;
        __u32 capabilities;
-        int count;
+        __u16 count;
        int resp_buf_type;
        struct kvec iov[3];
        enum securityEnum type;
-        __u16 action;
+        __u16 action, bytes_remaining;
-        int bytes_remaining;
        struct key *spnego_key = NULL;
        __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
        u16 blob_len;
@@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate:
                rc = -ENOSYS;
                goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
-        } else {
+        } else if (type == RawNTLMSSP) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+                if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-                if (type == RawNTLMSSP) {
+                        cERROR(1, "NTLMSSP requires Unicode support");
-                        if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+                        rc = -ENOSYS;
-                                cERROR(1, "NTLMSSP requires Unicode support");
+                        goto ssetup_exit;
-                                rc = -ENOSYS;
+                }
+                cFYI(1, "ntlmssp session setup phase %d", phase);
+                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+                capabilities |= CAP_EXTENDED_SECURITY;
+                pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+                switch(phase) {
+                case NtLmNegotiate:
+                        build_ntlmssp_negotiate_blob(
+                                pSMB->req.SecurityBlob, ses);
+                        iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+                        iov[1].iov_base = pSMB->req.SecurityBlob;
+                        pSMB->req.SecurityBlobLength =
+                                cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+                        break;
+                case NtLmAuthenticate:
+                        /*
+                         * 5 is an empirical value, large enough to hold
+                         * authenticate message plus max 10 of av pairs,
+                         * domain, user, workstation names, flags, etc.
+                         */
+                        ntlmsspblob = kzalloc(
+                                5*sizeof(struct _AUTHENTICATE_MESSAGE),
+                                GFP_KERNEL);
+                        if (!ntlmsspblob) {
+                                cERROR(1, "Can't allocate NTLMSSP blob");
+                                rc = -ENOMEM;
                                goto ssetup_exit;
                        }
-                        cFYI(1, "ntlmssp session setup phase %d", phase);
+                        rc = build_ntlmssp_auth_blob(ntlmsspblob,
-                        pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+                                                &blob_len, ses, nls_cp);
-                        capabilities |= CAP_EXTENDED_SECURITY;
+                        if (rc)
-                        pSMB->req.Capabilities |= cpu_to_le32(capabilities);
-                        if (phase == NtLmNegotiate) {
-                                setup_ntlmssp_neg_req(pSMB, ses);
-                                iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
-                                iov[1].iov_base = &pSMB->req.SecurityBlob[0];
-                        } else if (phase == NtLmAuthenticate) {
-                                /* 5 is an empirical value, large enought to
-                                 * hold authenticate message, max 10 of
-                                 * av paris, doamin,user,workstation mames,
-                                 * flags etc..
-                                 */
-                                ntlmsspblob = kmalloc(
-                                        5*sizeof(struct _AUTHENTICATE_MESSAGE),
-                                        GFP_KERNEL);
-                                if (!ntlmsspblob) {
-                                        cERROR(1, "Can't allocate NTLMSSP");
-                                        rc = -ENOMEM;
-                                        goto ssetup_exit;
-                                }
-                                rc = build_ntlmssp_auth_blob(ntlmsspblob,
-                                                        &blob_len, ses, nls_cp);
-                                if (rc)
-                                        goto ssetup_exit;
-                                iov[1].iov_len = blob_len;
-                                iov[1].iov_base = ntlmsspblob;
-                                pSMB->req.SecurityBlobLength =
-                                        cpu_to_le16(blob_len);
-                                /* Make sure that we tell the server that we
-                                   are using the uid that it just gave us back
-                                   on the response (challenge) */
-                                smb_buf->Uid = ses->Suid;
-                        } else {
-                                cERROR(1, "invalid phase %d", phase);
-                                rc = -ENOSYS;
                                goto ssetup_exit;
-                        }
+                        iov[1].iov_len = blob_len;
-                        /* unicode strings must be word aligned */
+                        iov[1].iov_base = ntlmsspblob;
-                        if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+                        pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
-                                *bcc_ptr = 0;
+                        /*
-                                bcc_ptr++;
+                         * Make sure that we tell the server that we are using
-                        }
+                         * the uid that it just gave us back on the response
-                        unicode_oslm_strings(&bcc_ptr, nls_cp);
+                         * (challenge)
-                } else {
+                         */
-                        cERROR(1, "secType %d not supported!", type);
+                        smb_buf->Uid = ses->Suid;
+                        break;
+                default:
+                        cERROR(1, "invalid phase %d", phase);
                        rc = -ENOSYS;
                        goto ssetup_exit;
                }
-#else
+                /* unicode strings must be word aligned */
+                if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+                        *bcc_ptr = 0;
+                        bcc_ptr++;
+                }
+                unicode_oslm_strings(&bcc_ptr, nls_cp);
+        } else {
                cERROR(1, "secType %d not supported!", type);
                rc = -ENOSYS;
                goto ssetup_exit;
-#endif
        }
        iov[2].iov_base = str_area;
@@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate:
        count = iov[1].iov_len + iov[2].iov_len;
        smb_buf->smb_buf_length += count;
-        BCC_LE(smb_buf) = cpu_to_le16(count);
+        put_bcc_le(count, smb_buf);
        rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
-                          CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
+                          CIFS_LOG_ERROR);
        /* SMB request buf freed in SendReceive2 */
        pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
@@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate:
        cFYI(1, "UID = %d ", ses->Suid);
        /* response can have either 3 or 4 word count - Samba sends 3 */
        /* and lanman response is 3 */
-        bytes_remaining = BCC(smb_buf);
+        bytes_remaining = get_bcc(smb_buf);
        bcc_ptr = pByteArea(smb_buf);
        if (smb_buf->WordCount == 4) {
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500b..04721485925 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
   up with a different answer to the one above)
 */
 #include <linux/slab.h>
-#include "cifsencrypt.h"
 #define uchar unsigned char
 static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20..b5450e9f40c 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
 #include "cifs_unicode.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
-#include "md5.h"
 #include "cifs_debug.h"
-#include "cifsencrypt.h"
+#include "cifsproto.h"
 #ifndef false
 #define false 0
@@ -48,14 +47,57 @@
 #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
 #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
-/*The following definitions come from  libsmb/smbencrypt.c  */
+/*
+ * Produce an MD4 message digest of the link_len bytes at link_str.
+ *
+ * The digest is written to md4_hash by crypto_shash_final().
+ *
+ * Returns 0 on success, or a negative error code when the md4 transform
+ * cannot be allocated, the hash descriptor cannot be allocated, or one of
+ * the init/update/final hash steps fails.
+ */
+int
+mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
+{
+        int rc;
+        unsigned int size;
+        struct crypto_shash *md4;
+        struct sdesc *sdescmd4;
+        md4 = crypto_alloc_shash("md4", 0, 0);
+        if (IS_ERR(md4)) {
+                /* capture the error first: rc is both logged and returned */
+                rc = PTR_ERR(md4);
+                cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
+                return rc;
+        }
+        /* the descriptor is followed by the transform's private state */
+        size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
+        sdescmd4 = kmalloc(size, GFP_KERNEL);
+        if (!sdescmd4) {
+                rc = -ENOMEM;
+                cERROR(1, "%s: Memory allocation failure\n", __func__);
+                goto mdfour_err;
+        }
+        sdescmd4->shash.tfm = md4;
+        sdescmd4->shash.flags = 0x0;
+        rc = crypto_shash_init(&sdescmd4->shash);
+        if (rc) {
+                cERROR(1, "%s: Could not init md4 shash\n", __func__);
+                goto mdfour_err;
+        }
+        /* do not finalize a digest whose input could not be hashed */
+        rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+        if (rc) {
+                cERROR(1, "%s: Could not update md4 shash\n", __func__);
+                goto mdfour_err;
+        }
+        rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
-void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
+mdfour_err:
-                unsigned char *p24);
+        crypto_free_shash(md4);
-void E_md4hash(const unsigned char *passwd, unsigned char *p16);
+        kfree(sdescmd4);
-static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-                   unsigned char p24[24]);
+        return rc;
-void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
+}
+/*
+ * DES encryption step driven by the NT or LM MD4 hash: the 16-byte hash in
+ * passwd is placed in a 21-byte buffer whose remaining bytes are zero, and
+ * that buffer is handed to E_P24() together with c8 to fill p24.
+ */
+static void
+SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
+              unsigned char p24[24])
+{
+        unsigned char key21[21];
+        /* hash goes in front, the trailing bytes are zero padding */
+        memcpy(key21, passwd, 16);
+        memset(key21 + 16, '\0', sizeof(key21) - 16);
+        E_P24(key21, c8, p24);
+}
 /*
   This implements the X/Open SMB password encryption
@@ -118,9 +160,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
 * Creates the MD4 Hash of the users password in NT UNICODE.
 */
-void
+int
 E_md4hash(const unsigned char *passwd, unsigned char *p16)
 {
+        int rc;
        int len;
        __u16 wpwd[129];
@@ -139,8 +182,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
        /* Calculate length in bytes */
        len = _my_wcslen(wpwd) * sizeof(__u16);
-        mdfour(p16, (unsigned char *) wpwd, len);
+        rc = mdfour(p16, (unsigned char *) wpwd, len);
        memset(wpwd, 0, 129 * 2);
+        return rc;
 }
 #if 0 /* currently unused */
@@ -212,19 +257,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
 }
 #endif
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-              unsigned char p24[24])
-{
-        unsigned char p21[21];
-        memset(p21, '\0', 21);
-        memcpy(p21, passwd, 16);
-        E_P24(p21, c8, p24);
-}
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
 #if 0 /* currently unused */
 static void
@@ -242,16 +274,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 #endif
 /* Does the NT MD4 hash then des encryption. */
+int
-void
 SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 {
+        int rc;
        unsigned char p21[21];
        memset(p21, '\0', 21);
-        E_md4hash(passwd, p21);
+        rc = E_md4hash(passwd, p21);
+        if (rc) {
+                cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+                return rc;
+        }
        SMBOWFencrypt(p21, c8, p24);
+        return rc;
 }
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc..c1ccca1a933 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
 extern mempool_t *cifs_mid_poolp;
-static struct mid_q_entry *
+/*
+ * Mid callback that wakes the task stored in the mid's callback_data.
+ */
+static void
+wake_up_task(struct mid_q_entry *mid)
+{
+        void *waiter = mid->callback_data;
+        wake_up_process(waiter);
+}
+struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 {
        struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
        /*      do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
                /* when mid allocated can be before when sent */
                temp->when_alloc = jiffies;
-                temp->tsk = current;
+                /*
+                 * The default is for the mid to be synchronous, so the
+                 * default callback just wakes up the current task.
+                 */
+                temp->callback = wake_up_task;
+                temp->callback_data = current;
        }
-        spin_lock(&GlobalMid_Lock);
-        list_add_tail(&temp->qhead, &server->pending_mid_q);
        atomic_inc(&midCount);
        temp->midState = MID_REQUEST_ALLOCATED;
-        spin_unlock(&GlobalMid_Lock);
        return temp;
 }
-static void
+void
 DeleteMidQEntry(struct mid_q_entry *midEntry)
 {
 #ifdef CONFIG_CIFS_STATS2
        unsigned long now;
 #endif
-        spin_lock(&GlobalMid_Lock);
        midEntry->midState = MID_FREE;
-        list_del(&midEntry->qhead);
        atomic_dec(&midCount);
-        spin_unlock(&GlobalMid_Lock);
        if (midEntry->largeBuf)
                cifs_buf_release(midEntry->resp_buf);
        else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
        mempool_free(midEntry, cifs_mid_poolp);
 }
+/*
+ * Unlink a mid from the list it is queued on, holding GlobalMid_Lock for
+ * the list update, and then release it through DeleteMidQEntry().
+ */
+static void
+delete_mid(struct mid_q_entry *mid)
+{
+        spin_lock(&GlobalMid_Lock);
+        list_del(&mid->qhead);
+        spin_unlock(&GlobalMid_Lock);
+        /* the lock is dropped before the entry itself is freed */
+        DeleteMidQEntry(mid);
+}
 static int
 smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 {
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
        if (ssocket == NULL)
                return -ENOTSOCK; /* BB eventually add reconnect code here */
-        smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
+        smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
        smb_msg.msg_namelen = sizeof(struct sockaddr);
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
        return smb_sendv(server, &iov, 1);
 }
-static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
+static int wait_for_free_request(struct TCP_Server_Info *server,
+                                 const int long_op)
 {
        if (long_op == CIFS_ASYNC_OP) {
                /* oplock breaks must not be held up */
-                atomic_inc(&ses->server->inFlight);
+                atomic_inc(&server->inFlight);
                return 0;
        }
        spin_lock(&GlobalMid_Lock);
        while (1) {
-                if (atomic_read(&ses->server->inFlight) >=
+                if (atomic_read(&server->inFlight) >= cifs_max_pending) {
-                                cifs_max_pending){
                        spin_unlock(&GlobalMid_Lock);
 #ifdef CONFIG_CIFS_STATS2
-                        atomic_inc(&ses->server->num_waiters);
+                        atomic_inc(&server->num_waiters);
 #endif
-                        wait_event(ses->server->request_q,
+                        wait_event(server->request_q,
-                                   atomic_read(&ses->server->inFlight)
+                                   atomic_read(&server->inFlight)
                                     < cifs_max_pending);
 #ifdef CONFIG_CIFS_STATS2
-                        atomic_dec(&ses->server->num_waiters);
+                        atomic_dec(&server->num_waiters);
 #endif
                        spin_lock(&GlobalMid_Lock);
                } else {
-                        if (ses->server->tcpStatus == CifsExiting) {
+                        if (server->tcpStatus == CifsExiting) {
                                spin_unlock(&GlobalMid_Lock);
                                return -ENOENT;
                        }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
                        /* update # of requests on the wire to server */
                        if (long_op != CIFS_BLOCKING_OP)
-                                atomic_inc(&ses->server->inFlight);
+                                atomic_inc(&server->inFlight);
                        spin_unlock(&GlobalMid_Lock);
                        break;
                }
@@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
        *ppmidQ = AllocMidQEntry(in_buf, ses->server);
        if (*ppmidQ == NULL)
                return -ENOMEM;
+        spin_lock(&GlobalMid_Lock);
+        list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
+        spin_unlock(&GlobalMid_Lock);
        return 0;
 }
-static int wait_for_response(struct cifsSesInfo *ses,
+static int
-                        struct mid_q_entry *midQ,
+wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
-                        unsigned long timeout,
-                        unsigned long time_to_wait)
 {
-        unsigned long curr_timeout;
+        int error;
-        for (;;) {
+        error = wait_event_killable(server->response_q,
-                curr_timeout = timeout + jiffies;
+                                    midQ->midState != MID_REQUEST_SUBMITTED);
-                wait_event_timeout(ses->server->response_q,
+        if (error < 0)
-                        midQ->midState != MID_REQUEST_SUBMITTED, timeout);
+                return -ERESTARTSYS;
-                if (time_after(jiffies, curr_timeout) &&
+        return 0;
-                        (midQ->midState == MID_REQUEST_SUBMITTED) &&
+}
-                        ((ses->server->tcpStatus == CifsGood) ||
-                         (ses->server->tcpStatus == CifsNew))) {
-                        unsigned long lrt;
-                        /* We timed out. Is the server still
+/*
-                           sending replies ? */
+ * Send a SMB request and set the callback function in the mid to handle
-                        spin_lock(&GlobalMid_Lock);
+ * the result. Caller is responsible for dealing with timeouts.
-                        lrt = ses->server->lstrp;
+ */
-                        spin_unlock(&GlobalMid_Lock);
+int
+cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+                mid_callback_t *callback, void *cbdata)
+{
+        int rc;
+        struct mid_q_entry *mid;
-                        /* Calculate time_to_wait past last receive time.
+        rc = wait_for_free_request(server, CIFS_ASYNC_OP);
-                         Although we prefer not to time out if the
+        if (rc)
-                         server is still responding - we will time
+                return rc;
-                         out if the server takes more than 15 (or 45
-                         or 180) seconds to respond to this request
+        mutex_lock(&server->srv_mutex);
-                         and has not responded to any request from
+        mid = AllocMidQEntry(in_buf, server);
-                         other threads on the client within 10 seconds */
+        if (mid == NULL) {
-                        lrt += time_to_wait;
+                mutex_unlock(&server->srv_mutex);
-                        if (time_after(jiffies, lrt)) {
+                return -ENOMEM;
-                                /* No replies for time_to_wait. */
-                                cERROR(1, "server not responding");
-                                return -1;
-                        }
-                } else {
-                        return 0;
-                }
        }
-}
+        /* put it on the pending_mid_q */
+        spin_lock(&GlobalMid_Lock);
+        list_add_tail(&mid->qhead, &server->pending_mid_q);
+        spin_unlock(&GlobalMid_Lock);
+        rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+        if (rc) {
+                mutex_unlock(&server->srv_mutex);
+                goto out_err;
+        }
+        mid->callback = callback;
+        mid->callback_data = cbdata;
+        mid->midState = MID_REQUEST_SUBMITTED;
+#ifdef CONFIG_CIFS_STATS2
+        atomic_inc(&server->inSend);
+#endif
+        rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+#ifdef CONFIG_CIFS_STATS2
+        atomic_dec(&server->inSend);
+        mid->when_sent = jiffies;
+#endif
+        mutex_unlock(&server->srv_mutex);
+        if (rc)
+                goto out_err;
+        return rc;
+out_err:
+        delete_mid(mid);
+        atomic_dec(&server->inFlight);
+        wake_up(&server->request_q);
+        return rc;
+}
 /*
 *
@@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
        return rc;
 }
+/*
+ * Translate the final state of a mid into a return code.
+ *
+ * The mid is always taken off its list under GlobalMid_Lock.  When a
+ * response was received, 0 is returned and the mid is left allocated for
+ * the caller.  In every other state the mid is freed with
+ * DeleteMidQEntry() and an error is returned: -EHOSTDOWN if the request is
+ * still marked submitted while the server is exiting, -EAGAIN if it is
+ * still marked submitted otherwise or needs a retry, -EIO for any
+ * unexpected state.
+ */
+static int
+sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+{
+        int rc = 0;
+        cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
+                mid->mid, mid->midState);
+        spin_lock(&GlobalMid_Lock);
+        /* ensure that it's no longer on the pending_mid_q */
+        list_del_init(&mid->qhead);
+        switch (mid->midState) {
+        case MID_RESPONSE_RECEIVED:
+                /* success: return without freeing the mid */
+                spin_unlock(&GlobalMid_Lock);
+                return rc;
+        case MID_REQUEST_SUBMITTED:
+                /* socket is going down, reject all calls */
+                if (server->tcpStatus == CifsExiting) {
+                        cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
+                               __func__, mid->mid, mid->command, mid->midState);
+                        rc = -EHOSTDOWN;
+                        break;
+                }
+                /* fall through - still submitted is handled like a retry */
+        case MID_RETRY_NEEDED:
+                rc = -EAGAIN;
+                break;
+        default:
+                cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
+                        mid->mid, mid->midState);
+                rc = -EIO;
+        }
+        spin_unlock(&GlobalMid_Lock);
+        DeleteMidQEntry(mid);
+        return rc;
+}
+/*
+ * Turn an existing request buffer into an SMB_COM_NT_CANCEL request and
+ * send it.
+ *
+ * The cancel header is the original request header with three changes:
+ *
+ * Command becomes SMB_COM_NT_CANCEL
+ * WordCount is set to zero
+ * ByteCount is set to zero
+ *
+ * The buffer is modified in place.  Signing and sending both happen with
+ * the server's srv_mutex held.  Returns the result of cifs_sign_smb() if
+ * signing fails, otherwise the result of smb_send().
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+                struct mid_q_entry *mid)
+{
+        int rc;
+        in_buf->Command = SMB_COM_NT_CANCEL;
+        in_buf->WordCount = 0;
+        /* header minus the 4-byte RFC1001 length, plus 2 for the BCC field */
+        in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4  + 2;
+        put_bcc_le(0, in_buf);
+        mutex_lock(&server->srv_mutex);
+        rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+        if (rc == 0) {
+                rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+                mutex_unlock(&server->srv_mutex);
+                cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+                        in_buf->Mid, rc);
+                return rc;
+        }
+        /* signing failed: nothing was sent */
+        mutex_unlock(&server->srv_mutex);
+        return rc;
+}
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
             struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
        int rc = 0;
        int long_op;
        unsigned int receive_len;
-        unsigned long timeout;
        struct mid_q_entry *midQ;
        struct smb_hdr *in_buf = iov[0].iov_base;
@@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
           to the same server. We may make this configurable later or
           use ses->maxReq */
-        rc = wait_for_free_request(ses, long_op);
+        rc = wait_for_free_request(ses->server, long_op);
        if (rc) {
                cifs_small_buf_release(in_buf);
                return rc;
@@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
        if (rc < 0)
                goto out;
-        if (long_op == CIFS_STD_OP)
+        if (long_op == CIFS_ASYNC_OP)
-                timeout = 15 * HZ;
-        else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
-                timeout = 180 * HZ;
-        else if (long_op == CIFS_LONG_OP)
-                timeout = 45 * HZ; /* should be greater than
-                        servers oplock break timeout (about 43 seconds) */
-        else if (long_op == CIFS_ASYNC_OP)
                goto out;
-        else if (long_op == CIFS_BLOCKING_OP)
-                timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
-        else {
-                cERROR(1, "unknown timeout flag %d", long_op);
-                rc = -EIO;
-                goto out;
-        }
-        /* wait for 15 seconds or until woken up due to response arriving or
-           due to last connection to this server being unmounted */
-        if (signal_pending(current)) {
-                /* if signal pending do not hold up user for full smb timeout
-                but we still give response a chance to complete */
-                timeout = 2 * HZ;
-        }
-        /* No user interrupts in wait - wreaks havoc with performance */
-        wait_for_response(ses, midQ, timeout, 10 * HZ);
-        spin_lock(&GlobalMid_Lock);
-        if (midQ->resp_buf == NULL) {
+        rc = wait_for_response(ses->server, midQ);
-                cERROR(1, "No response to cmd %d mid %d",
+        if (rc != 0)
-                        midQ->command, midQ->mid);
+                goto out;
-                if (midQ->midState == MID_REQUEST_SUBMITTED) {
-                        if (ses->server->tcpStatus == CifsExiting)
-                                rc = -EHOSTDOWN;
-                        else {
-                                ses->server->tcpStatus = CifsNeedReconnect;
-                                midQ->midState = MID_RETRY_NEEDED;
-                        }
-                }
-                if (rc != -EHOSTDOWN) {
+        rc = sync_mid_result(midQ, ses->server);
-                        if (midQ->midState == MID_RETRY_NEEDED) {
+        if (rc != 0) {
-                                rc = -EAGAIN;
-                                cFYI(1, "marking request for retry");
-                        } else {
-                                rc = -EIO;
-                        }
-                }
-                spin_unlock(&GlobalMid_Lock);
-                DeleteMidQEntry(midQ);
-                /* Update # of requests on wire to server */
                atomic_dec(&ses->server->inFlight);
                wake_up(&ses->server->request_q);
                return rc;
        }
-        spin_unlock(&GlobalMid_Lock);
        receive_len = midQ->resp_buf->smb_buf_length;
        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                if (receive_len >= sizeof(struct smb_hdr) - 4
                    /* do not count RFC1001 header */  +
                    (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
-                        BCC(midQ->resp_buf) =
+                        put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
-                                le16_to_cpu(BCC_LE(midQ->resp_buf));
                if ((flags & CIFS_NO_RESP) == 0)
                        midQ->resp_buf = NULL;  /* mark it so buf will
                                                   not be freed by
-                                                   DeleteMidQEntry */
+                                                   delete_mid */
        } else {
                rc = -EIO;
                cFYI(1, "Bad MID state?");
        }
 out:
-        DeleteMidQEntry(midQ);
+        delete_mid(midQ);
        atomic_dec(&ses->server->inFlight);
        wake_up(&ses->server->request_q);
@@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 {
        int rc = 0;
        unsigned int receive_len;
-        unsigned long timeout;
        struct mid_q_entry *midQ;
        if (ses == NULL) {
@@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                return -EIO;
        }
-        rc = wait_for_free_request(ses, long_op);
+        rc = wait_for_free_request(ses->server, long_op);
        if (rc)
                return rc;
@@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
        if (rc < 0)
                goto out;
-        if (long_op == CIFS_STD_OP)
+        if (long_op == CIFS_ASYNC_OP)
-                timeout = 15 * HZ;
-        /* wait for 15 seconds or until woken up due to response arriving or
-           due to last connection to this server being unmounted */
-        else if (long_op == CIFS_ASYNC_OP)
                goto out;
-        else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
-                timeout = 180 * HZ;
-        else if (long_op == CIFS_LONG_OP)
-                timeout = 45 * HZ; /* should be greater than
-                        servers oplock break timeout (about 43 seconds) */
-        else if (long_op == CIFS_BLOCKING_OP)
-                timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
-        else {
-                cERROR(1, "unknown timeout flag %d", long_op);
-                rc = -EIO;
-                goto out;
-        }
-        if (signal_pending(current)) {
+        rc = wait_for_response(ses->server, midQ);
-                /* if signal pending do not hold up user for full smb timeout
+        if (rc != 0)
-                but we still give response a chance to complete */
+                goto out;
-                timeout = 2 * HZ;
-        }
-        /* No user interrupts in wait - wreaks havoc with performance */
-        wait_for_response(ses, midQ, timeout, 10 * HZ);
-        spin_lock(&GlobalMid_Lock);
-        if (midQ->resp_buf == NULL) {
-                cERROR(1, "No response for cmd %d mid %d",
-                          midQ->command, midQ->mid);
-                if (midQ->midState == MID_REQUEST_SUBMITTED) {
-                        if (ses->server->tcpStatus == CifsExiting)
-                                rc = -EHOSTDOWN;
-                        else {
-                                ses->server->tcpStatus = CifsNeedReconnect;
-                                midQ->midState = MID_RETRY_NEEDED;
-                        }
-                }
-                if (rc != -EHOSTDOWN) {
+        rc = sync_mid_result(midQ, ses->server);
-                        if (midQ->midState == MID_RETRY_NEEDED) {
+        if (rc != 0) {
-                                rc = -EAGAIN;
-                                cFYI(1, "marking request for retry");
-                        } else {
-                                rc = -EIO;
-                        }
-                }
-                spin_unlock(&GlobalMid_Lock);
-                DeleteMidQEntry(midQ);
-                /* Update # of requests on wire to server */
                atomic_dec(&ses->server->inFlight);
                wake_up(&ses->server->request_q);
                return rc;
        }
-        spin_unlock(&GlobalMid_Lock);
        receive_len = midQ->resp_buf->smb_buf_length;
        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                if (receive_len >= sizeof(struct smb_hdr) - 4
                    /* do not count RFC1001 header */  +
                    (2 * out_buf->WordCount) + 2 /* bcc */ )
-                        BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+                        put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
        } else {
                rc = -EIO;
                cERROR(1, "Bad MID state?");
        }
 out:
-        DeleteMidQEntry(midQ);
+        delete_mid(midQ);
        atomic_dec(&ses->server->inFlight);
        wake_up(&ses->server->request_q);
        return rc;
 }
-/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
-static int
-send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
-                struct mid_q_entry *midQ)
-{
-        int rc = 0;
-        struct cifsSesInfo *ses = tcon->ses;
-        __u16 mid = in_buf->Mid;
-        header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
-        in_buf->Mid = mid;
-        mutex_lock(&ses->server->srv_mutex);
-        rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
-        if (rc) {
-                mutex_unlock(&ses->server->srv_mutex);
-                return rc;
-        }
-        rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-        mutex_unlock(&ses->server->srv_mutex);
-        return rc;
-}
 /* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
   blocking lock to return. */
@@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
        pSMB->hdr.Mid = GetNextMid(ses->server);
        return SendReceive(xid, ses, in_buf, out_buf,
-                        &bytes_returned, CIFS_STD_OP);
+                        &bytes_returned, 0);
 }
 int
@@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
                return -EIO;
        }
-        rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
+        rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
        if (rc)
                return rc;
@@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
        rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
        if (rc) {
-                DeleteMidQEntry(midQ);
+                delete_mid(midQ);
                mutex_unlock(&ses->server->srv_mutex);
                return rc;
        }
@@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
        mutex_unlock(&ses->server->srv_mutex);
        if (rc < 0) {
-                DeleteMidQEntry(midQ);
+                delete_mid(midQ);
                return rc;
        }
@@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
                if (in_buf->Command == SMB_COM_TRANSACTION2) {
                        /* POSIX lock. We send a NT_CANCEL SMB to cause the
                           blocking lock to return. */
+                        rc = send_nt_cancel(ses->server, in_buf, midQ);
-                        rc = send_nt_cancel(tcon, in_buf, midQ);
                        if (rc) {
-                                DeleteMidQEntry(midQ);
+                                delete_mid(midQ);
                                return rc;
                        }
                } else {
@@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
                        /* If we get -ENOLCK back the lock may have
                           already been removed. Don't exit in this case. */
                        if (rc && rc != -ENOLCK) {
-                                DeleteMidQEntry(midQ);
+                                delete_mid(midQ);
                                return rc;
                        }
                }
-                /* Wait 5 seconds for the response. */
+                if (wait_for_response(ses->server, midQ) == 0) {
-                if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
                        /* We got the response - restart system call. */
                        rstart = 1;
                }
        }
-        spin_lock(&GlobalMid_Lock);
+        rc = sync_mid_result(midQ, ses->server);
-        if (midQ->resp_buf) {
+        if (rc != 0)
-                spin_unlock(&GlobalMid_Lock);
-                receive_len = midQ->resp_buf->smb_buf_length;
-        } else {
-                cERROR(1, "No response for cmd %d mid %d",
-                          midQ->command, midQ->mid);
-                if (midQ->midState == MID_REQUEST_SUBMITTED) {
-                        if (ses->server->tcpStatus == CifsExiting)
-                                rc = -EHOSTDOWN;
-                        else {
-                                ses->server->tcpStatus = CifsNeedReconnect;
-                                midQ->midState = MID_RETRY_NEEDED;
-                        }
-                }
-                if (rc != -EHOSTDOWN) {
-                        if (midQ->midState == MID_RETRY_NEEDED) {
-                                rc = -EAGAIN;
-                                cFYI(1, "marking request for retry");
-                        } else {
-                                rc = -EIO;
-                        }
-                }
-                spin_unlock(&GlobalMid_Lock);
-                DeleteMidQEntry(midQ);
                return rc;
-        }
+        receive_len = midQ->resp_buf->smb_buf_length;
        if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
                cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
                        receive_len, xid);
@@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
        if (receive_len >= sizeof(struct smb_hdr) - 4
            /* do not count RFC1001 header */  +
            (2 * out_buf->WordCount) + 2 /* bcc */ )
-                BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+                put_bcc(get_bcc_le(out_buf), out_buf);
 out:
-        DeleteMidQEntry(midQ);
+        delete_mid(midQ);
        if (rstart && rc == -EACCES)
                return -ERESTARTSYS;
        return rc;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb4..eae2a149160 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
 #define MAX_EA_VALUE_SIZE 65535
 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
+#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
 #define CIFS_XATTR_USER_PREFIX "user."
 #define CIFS_XATTR_SYSTEM_PREFIX "system."
 #define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX ".security"
+#define CIFS_XATTR_SECURITY_PREFIX "security."
 #define CIFS_XATTR_TRUSTED_PREFIX "trusted."
 #define XATTR_TRUSTED_PREFIX_LEN  8
 #define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                cifs_sb->local_nls,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-                else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                        __u16 fid;
-                        int oplock = 0;
-                        struct cifs_ntsd *pacl = NULL;
-                        __u32 buflen = 0;
-                        if (experimEnabled)
-                                rc = CIFSSMBOpen(xid, pTcon, full_path,
-                                        FILE_OPEN, GENERIC_READ, 0, &fid,
-                                        &oplock, NULL, cifs_sb->local_nls,
-                                        cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        /* else rc is EOPNOTSUPP from above */
-                        if (rc == 0) {
-                                rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
-                                                      &buflen);
-                                CIFSSMBClose(xid, pTcon, fid);
-                        }
-                }
-#endif /* EXPERIMENTAL */
 #else
-                cFYI(1, "query POSIX ACL not supported yet");
+                cFYI(1, "Query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
        } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
                          strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-                cFYI(1, "query POSIX default ACL not supported yet");
+                cFYI(1, "Query POSIX default ACL not supported yet");
-#endif
+#endif /* CONFIG_CIFS_POSIX */
+        } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+                                strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+                        u32 acllen;
+                        struct cifs_ntsd *pacl;
+                        pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
+                                                full_path, &acllen);
+                        if (IS_ERR(pacl)) {
+                                rc = PTR_ERR(pacl);
+                                cERROR(1, "%s: error %zd getting sec desc",
+                                                __func__, rc);
+                        } else {
+                                if (ea_value) {
+                                        if (acllen > buf_size)
+                                                acllen = -ERANGE;
+                                        else
+                                                memcpy(ea_value, pacl, acllen);
+                                }
+                                rc = acllen;
+                                kfree(pacl);
+                        }
+#else
+                cFYI(1, "Query CIFS ACL not supported yet");
+#endif /* CONFIG_CIFS_ACL */
        } else if (strncmp(ea_name,
                  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
                cFYI(1, "Trusted xattr namespace not supported yet");
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 9060f08e70c..69015787618 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -20,10 +20,9 @@
 #include <linux/spinlock.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
-#include <linux/coda_cache.h>
+#include "coda_cache.h"
 static atomic_t permission_epoch = ATOMIC_INIT(0);
@@ -93,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
        struct list_head *child;
        struct dentry *de;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        list_for_each(child, &parent->d_subdirs)
        {
                de = list_entry(child, struct dentry, d_u.d_child);
@@ -102,7 +101,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
                        continue;
                coda_flag_inode(de->d_inode, flag);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
        return; 
 }
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 602240569c8..6475877b076 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
 #include <linux/time.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
+#include "coda_linux.h"
 static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
 {
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 00000000000..c910b5eb1ce
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
+/* Coda filesystem -- Linux Minicache
+ *
+ * Copyright (C) 1989 - 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project. Contact Peter Braam
+ * <coda@cs.cmu.edu>
+ */
+#ifndef _CFSNC_HEADER_
+#define _CFSNC_HEADER_
+/* credential cache */
+void coda_cache_enter(struct inode *inode, int mask);
+void coda_cache_clear_inode(struct inode *);
+void coda_cache_clear_all(struct super_block *sb);
+int coda_cache_check(struct inode *inode, int mask);
+/* for downcalls and attributes and lookups */
+void coda_flag_inode_children(struct inode *inode, int flag);
+#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 00000000000..e35071b1de0
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
+/*
+ *  coda_fs_i.h
+ *
+ *  Copyright (C) 1998 Carnegie Mellon University
+ *
+ */
+#ifndef _LINUX_CODA_FS_I
+#define _LINUX_CODA_FS_I
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/coda.h>
+/*
+ * coda fs inode data
+ * Per-inode Coda state that embeds the VFS inode as its vfs_inode member.
+ * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
+ * c_cached_perm.
+ * vfs_inode is set only when the inode is created and never changes.
+ * c_fid is set when the inode is created and should be considered immutable.
+ */
+struct coda_inode_info {
+        struct CodaFid     c_fid;       /* Coda identifier */
+        u_short            c_flags;     /* flags (see below) */
+        unsigned int       c_mapcount;  /* nr of times this inode is mapped */
+        unsigned int       c_cached_epoch; /* epoch for cached permissions */
+        vuid_t             c_uid;       /* fsuid for cached permissions */
+        unsigned int       c_cached_perm; /* cached access permissions */
+        spinlock_t         c_lock;      /* guards the mutable c_* fields named in the header comment */
+        struct inode       vfs_inode;   /* embedded VFS inode */
+};
+/*
+ * coda fs file private data
+ * CODA_FTOC() below reads this structure out of a struct file's
+ * private_data pointer.
+ */
+#define CODA_MAGIC 0xC0DAC0DA
+struct coda_file_info {
+        int                cfi_magic;     /* magic number (presumably CODA_MAGIC; confirm where it is set) */
+        struct file       *cfi_container; /* container file for this cnode */
+        unsigned int       cfi_mapcount;  /* nr of times this file is mapped */
+};
+#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
+/* flags */
+#define C_VATTR       0x1   /* Validity of vattr in inode */
+#define C_FLUSH       0x2   /* used after a flush */
+#define C_DYING       0x4   /* from venus (which died) */
+#define C_PURGE       0x8
+int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
+int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
+void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
+#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e..2bdbcc11b37 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
 #include <linux/string.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
 /* initialize the debugging variables */
 int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 00000000000..9b0c5323890
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
+/* 
+ * Coda File System, Linux Kernel module
+ * 
+ * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
+ * Linux modifications (C) 1996, Peter J. Braam
+ * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project.
+ */
+#ifndef _LINUX_CODA_FS
+#define _LINUX_CODA_FS
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/wait.h>         
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "coda_fs_i.h"
+/* operations */
+extern const struct inode_operations coda_dir_inode_operations;
+extern const struct inode_operations coda_file_inode_operations;
+extern const struct inode_operations coda_ioctl_inode_operations;
+extern const struct dentry_operations coda_dentry_operations;
+extern const struct address_space_operations coda_file_aops;
+extern const struct address_space_operations coda_symlink_aops;
+extern const struct file_operations coda_dir_operations;
+extern const struct file_operations coda_file_operations;
+extern const struct file_operations coda_ioctl_operations;
+/* operations shared over more than one file */
+int coda_open(struct inode *i, struct file *f);
+int coda_release(struct inode *i, struct file *f);
+int coda_permission(struct inode *inode, int mask, unsigned int flags);
+int coda_revalidate_inode(struct dentry *);
+int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+int coda_setattr(struct dentry *, struct iattr *);
+/* this file:  helpers */
+char *coda_f2s(struct CodaFid *f);
+int coda_isroot(struct inode *i);
+int coda_iscontrol(const char *name, size_t length);
+void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
+void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
+unsigned short coda_flags_to_cflags(unsigned short);
+/* sysctl.h */
+void coda_sysctl_init(void);
+void coda_sysctl_clean(void);
+/*
+ * CODA_ALLOC - allocate a zero-filled buffer of `size` bytes into `ptr`.
+ * Requests smaller than PAGE_SIZE use kmalloc(GFP_KERNEL); anything else
+ * uses vmalloc(), whose result is converted with `cast`.  A failed
+ * allocation is only reported through printk(), so callers must still
+ * test `ptr`.  `ptr` and `size` are expanded more than once: pass
+ * expressions without side effects.  Every use of an argument is
+ * parenthesized so that an expression such as `a + b` is compared, cast
+ * and zeroed as a whole.
+ */
+#define CODA_ALLOC(ptr, cast, size) do { \
+    if ((size) < PAGE_SIZE) \
+        (ptr) = kmalloc((unsigned long) (size), GFP_KERNEL); \
+    else \
+        (ptr) = (cast)vmalloc((unsigned long) (size)); \
+    if (!(ptr)) \
+        printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
+    else memset((ptr), 0, (size)); \
+} while (0)
+/*
+ * CODA_FREE - release a buffer obtained with CODA_ALLOC.
+ * `size` selects the release routine with the same PAGE_SIZE test that
+ * CODA_ALLOC uses to pick the allocator, so pass the size that was used
+ * for the allocation.  `size` is parenthesized so an expression argument
+ * is compared as a whole.
+ */
+#define CODA_FREE(ptr,size) \
+    do { if ((size) < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+/* inode to cnode access functions */
+/*
+ * ITOC - map a VFS inode to the coda_inode_info that embeds it.
+ * @inode must be the vfs_inode member of a struct coda_inode_info.
+ * container_of() is used rather than list_entry() because vfs_inode is an
+ * embedded struct inode, not a list_head.
+ */
+static inline struct coda_inode_info *ITOC(struct inode *inode)
+{
+        return container_of(inode, struct coda_inode_info, vfs_inode);
+}
+/* Return the address of the c_fid member of the inode's coda_inode_info. */
+static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
+{
+        struct coda_inode_info *cii = ITOC(inode);
+        return &cii->c_fid;
+}
+/* Pass the inode's Coda fid to coda_f2s() and return its result. */
+static __inline__ char *coda_i2s(struct inode *inode)
+{
+        struct CodaFid *fid = &ITOC(inode)->c_fid;
+        return coda_f2s(fid);
+}
+/*
+ * OR @flag into the inode's c_flags while holding c_lock.
+ * this will not zap the inode away
+ */
+static __inline__ void coda_flag_inode(struct inode *inode, int flag)
+{
+        struct coda_inode_info *info = ITOC(inode);
+        spinlock_t *lock = &info->c_lock;
+        spin_lock(lock);
+        info->c_flags |= flag;
+        spin_unlock(lock);
+}
+#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 5d8b3553960..2b8dae4d121 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,14 +18,14 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
-#include <linux/coda_cache.h>
+#include "coda_cache.h"
 #include "coda_int.h"
@@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
-static int coda_dentry_delete(struct dentry *);
+static int coda_dentry_delete(const struct dentry *);
 /* support routines */
 static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -60,7 +60,7 @@ static int coda_return_EIO(void)
 }
 #define CODA_EIO_ERROR ((void *) (coda_return_EIO))
-static const struct dentry_operations coda_dentry_operations =
+const struct dentry_operations coda_dentry_operations =
 {
        .d_revalidate   = coda_dentry_revalidate,
        .d_delete       = coda_dentry_delete,
@@ -125,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
                return ERR_PTR(error);
 exit:
-        entry->d_op = &coda_dentry_operations;
        if (inode && (type & CODA_NOCACHE))
                coda_flag_inode(inode, C_VATTR | C_PURGE);
@@ -134,10 +132,13 @@ exit:
 }
-int coda_permission(struct inode *inode, int mask)
+int coda_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
        if (!mask)
@@ -541,9 +542,13 @@ out:
 /* called when a cache lookup succeeds */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 {
-        struct inode *inode = de->d_inode;
+        struct inode *inode;
        struct coda_inode_info *cii;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = de->d_inode;
        if (!inode || coda_isroot(inode))
                goto out;
        if (is_bad_inode(inode))
@@ -559,7 +564,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
        if (cii->c_flags & C_FLUSH) 
                coda_flag_inode_children(inode, C_FLUSH);
-        if (atomic_read(&de->d_count) > 1)
+        if (de->d_count > 1)
                /* pretend it's valid, but don't change the flags */
                goto out;
@@ -577,7 +582,7 @@ out:
 * This is the callback from dput() when d_count is going to 0.
 * We use this to unhash dentries with bad inodes.
 */
-static int coda_dentry_delete(struct dentry * dentry)
+static int coda_dentry_delete(const struct dentry * dentry)
 {
        int flags;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c8b50ba4366..0433057be33 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -21,10 +21,9 @@
 #include <asm/uaccess.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
+#include "coda_linux.h"
 #include "coda_int.h"
 static ssize_t
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 5ea57c8c7f9..871b2771546 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -28,10 +28,9 @@
 #include <linux/vmalloc.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
-#include <linux/coda_cache.h>
+#include "coda_cache.h"
 #include "coda_int.h"
@@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep;
 static struct inode *coda_alloc_inode(struct super_block *sb)
 {
        struct coda_inode_info *ei;
-        ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+        ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
        memset(&ei->c_fid, 0, sizeof(struct CodaFid));
@@ -56,11 +55,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void coda_destroy_inode(struct inode *inode)
+static void coda_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
+static void coda_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, coda_i_callback);
+}
 static void init_once(void *foo)
 {
        struct coda_inode_info *ei = (struct coda_inode_info *) foo;
@@ -186,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_blocksize_bits = 12;
        sb->s_magic = CODA_SUPER_MAGIC;
        sb->s_op = &coda_super_operations;
+        sb->s_d_op = &coda_dentry_operations;
        sb->s_bdi = &vc->bdi;
        /* get root fid from Venus: this needs the root inode */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 2fd89b5c5c7..6cbb3afb36d 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,12 +19,12 @@
 #include <asm/uaccess.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
+#include "coda_linux.h"
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask);
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
                        unsigned long user_data);
@@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = {
 };
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask)
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 62647a8595e..8f616e0e252 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -43,10 +43,10 @@
 #include <asm/uaccess.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
+#include "coda_linux.h"
 #include "coda_int.h"
 /* statistics */
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index af78f007a2b..ab94ef63cae 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -16,9 +16,9 @@
 #include <linux/pagemap.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
 static int coda_symlink_filler(struct file *file, struct page *page)
 {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c3563cab975..9727e0c5257 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,10 +33,9 @@
 #include <linux/vfs.h>
 #include <linux/coda.h>
-#include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
-#include <linux/coda_cache.h>
+#include "coda_cache.h"
 #include "coda_int.h"
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6..f6fd0a00e6c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 }
 /*
- * The following statfs calls are copies of code from fs/open.c and
+ * The following statfs calls are copies of code from fs/statfs.c and
 * should be checked against those from time to time
 */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
            __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
            __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
            __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-            __put_user(kbuf->f_frsize, &ubuf->f_frsize))
+            __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+            __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+            __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
                return -EFAULT;
        return 0;
 }
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
        if (nr_segs > fast_segs) {
                ret = -ENOMEM;
                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-                if (iov == NULL) {
+                if (iov == NULL)
-                        *ret_pointer = fast_pointer;
                        goto out;
-                }
        }
        *ret_pointer = iov;
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
                        argv++;
                        if (i++ >= max)
                                return -E2BIG;
+                        if (fatal_signal_pending(current))
+                                return -ERESTARTNOHAND;
+                        cond_resched();
                }
        }
        return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
                while (len > 0) {
                        int offset, bytes_to_copy;
+                        if (fatal_signal_pending(current)) {
+                                ret = -ERESTARTNOHAND;
+                                goto out;
+                        }
+                        cond_resched();
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;
-#ifdef CONFIG_STACK_GROWSUP
+                                page = get_arg_page(bprm, pos, 1);
-                                ret = expand_stack_downwards(bprm->vma, pos);
+                                if (!page) {
-                                if (ret < 0) {
-                                        /* We've exceed the stack rlimit. */
-                                        ret = -E2BIG;
-                                        goto out;
-                                }
-#endif
-                                ret = get_user_pages(current, bprm->mm, pos,
-                                                     1, 1, 1, &page, NULL);
-                                if (ret <= 0) {
-                                        /* We've exceed the stack rlimit. */
                                        ret = -E2BIG;
                                        goto out;
                                }
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
        return retval;
 out:
-        if (bprm->mm)
+        if (bprm->mm) {
+                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
+        }
 out_file:
        if (bprm->file) {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa..61abb638b4b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
 #include <linux/compiler.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
@@ -43,7 +42,7 @@
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
-#include <linux/videodev.h>
+#include <linux/videodev2.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/blkdev.h>
@@ -837,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
 COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
+COMPATIBLE_IOCTL(TIOCGDEV)
 COMPATIBLE_IOCTL(TIOCCBRK)
 COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0..9febcdefdfd 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
 config CONFIGFS_FS
        tristate "Userspace-driven configuration filesystem"
-        depends on SYSFS
+        select SYSFS
        help
-          configfs is a ram-based filesystem that provides the converse
+          configfs is a RAM-based filesystem that provides the converse
          of sysfs's functionality. Where sysfs is a filesystem-based
          view of kernel objects, configfs is a filesystem-based manager
          of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df4..82bda8fdfc1 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
 extern const struct file_operations bin_fops;
 extern const struct inode_operations configfs_dir_inode_operations;
 extern const struct inode_operations configfs_symlink_inode_operations;
+extern const struct dentry_operations configfs_dentry_ops;
 extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
                            const char *symname);
@@ -120,7 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 {
        struct config_item * item = NULL;
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        if (!d_unhashed(dentry)) {
                struct configfs_dirent * sd = dentry->d_fsdata;
                if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
                } else
                        item = config_item_get(sd->s_element);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return item;
 }
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c69..90ff3cb10de 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,12 +67,12 @@ static void configfs_d_iput(struct dentry * dentry,
 * We _must_ delete our dentries on last dput, as the chain-to-parent
 * behavior is required to clear the parents of default_groups.
 */
-static int configfs_d_delete(struct dentry *dentry)
+static int configfs_d_delete(const struct dentry *dentry)
 {
         return 1; /* unconditionally non-zero; dput() in fs/dcache.c jumps to kill_it when ->d_delete() returns non-zero */
 }
-static const struct dentry_operations configfs_dentry_ops = {
+const struct dentry_operations configfs_dentry_ops = {
        .d_iput         = configfs_d_iput,
        /* simple_delete_dentry() isn't exported */
        .d_delete       = configfs_d_delete,
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
        sd->s_mode = mode;
        sd->s_dentry = dentry;
-        if (dentry) {
+        if (dentry)
                dentry->d_fsdata = configfs_get(sd);
-                dentry->d_op = &configfs_dentry_ops;
-        }
        return 0;
 }
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
                error = configfs_create(d, mode, init_dir);
                if (!error) {
                        inc_nlink(p->d_inode);
-                        (d)->d_op = &configfs_dentry_ops;
                } else {
                        struct configfs_dirent *sd = d->d_fsdata;
                        if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
                                   CONFIGFS_ITEM_LINK);
        if (!err) {
                err = configfs_create(dentry, mode, init_symlink);
-                if (!err)
+                if (err) {
-                        dentry->d_op = &configfs_dentry_ops;
-                else {
                        struct configfs_dirent *sd = dentry->d_fsdata;
                        if (sd) {
                                spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
        if (d->d_inode)
                simple_rmdir(parent->d_inode,d);
-        pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+        pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
-                 atomic_read(&d->d_count));
        dput(parent);
 }
@@ -448,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
                return error;
        }
-        dentry->d_op = &configfs_dentry_ops;
        d_rehash(dentry);
        return 0;
@@ -493,7 +486,10 @@ static struct dentry * configfs_lookup(struct inode *dir,
                 * If it doesn't exist and it isn't a NOT_PINNED item,
                 * it must be negative.
                 */
-                return simple_lookup(dir, dentry, nd);
+                if (dentry->d_name.len > NAME_MAX)
+                        return ERR_PTR(-ENAMETOOLONG);
+                d_add(dentry, NULL);
+                return NULL;
        }
 out:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 253476d78ed..c83f4768eea 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
        struct dentry * dentry = sd->s_dentry;
        if (dentry) {
-                spin_lock(&dcache_lock);
                spin_lock(&dentry->d_lock);
                if (!(d_unhashed(dentry) && dentry->d_inode)) {
-                        dget_locked(dentry);
+                        dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
                        simple_unlink(parent->d_inode, dentry);
-                } else {
+                } else
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
-                }
        }
 }
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7d3607febe1..ecc62178bed 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
        configfs_root_group.cg_item.ci_dentry = root;
        root->d_fsdata = &configfs_root;
        sb->s_root = root;
+        sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
        return 0;
 }
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0..e141939080f 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
 static DEFINE_MUTEX(read_mutex);
-/* These two macros may change in future, to provide better st_ino
+/* These macros may change in future, to provide better st_ino semantics. */
-   semantics. */
-#define CRAMINO(x)      (((x)->offset && (x)->size)?(x)->offset<<2:1)
 #define OFFSET(x)       ((x)->i_ino)
-static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
+static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) /* choose the inode number for on-disk entry @cino */
 {
+        if (!cino->offset) /* no data offset recorded: fall back to @offset + 1 */
+                return offset + 1;
+        if (!cino->size) /* zero size: same fallback */
+                return offset + 1;
+        /*
+         * The file mode test fixes buggy mkcramfs implementations where
+         * cramfs_inode->offset is set to a non-zero value for entries
+         * which do not contain data, like device nodes and fifos.
+         *
+         * Only regular files, directories and symlinks get a number
+         * derived from their data offset (cino->offset << 2); every
+         * other mode falls through to the "offset + 1" fallback below.
+         * The callers in this file pass the position of the directory
+         * entry itself as @offset (0 for the root).
+         */
+        switch (cino->mode & S_IFMT) {
+        case S_IFREG:
+        case S_IFDIR:
+        case S_IFLNK:
+                return cino->offset << 2; /* low two bits are zero; get_cramfs_inode() tests i_ino & 3 for this */
+        default:
+                break;
+        }
+        return offset + 1; /* NOTE(review): presumably entry offsets are 4-byte aligned so this never has both low bits clear - confirm */
+}
+static struct inode *get_cramfs_inode(struct super_block *sb,
+        struct cramfs_inode *cramfs_inode, unsigned int offset)
+{ /* return the in-core inode for one on-disk entry, initialising it when iget_locked() hands back a new one */
+        struct inode *inode;
         static struct timespec zerotime; /* static storage, hence all-zero; copied into the three timestamps below */
+        inode = iget_locked(sb, cramino(cramfs_inode, offset));
+        if (!inode)
+                return ERR_PTR(-ENOMEM); /* NOTE(review): the callers visible in this diff test "!root" (cramfs_fill_super) or pass the result straight to d_add() (cramfs_lookup) - confirm they cope with an ERR_PTR */
+        if (!(inode->i_state & I_NEW))
+                return inode; /* I_NEW not set: skip all of the initialisation below */
+        switch (cramfs_inode->mode & S_IFMT) { /* pick the operation tables from the file type */
+        case S_IFREG:
+                inode->i_fop = &generic_ro_fops;
+                inode->i_data.a_ops = &cramfs_aops;
+                break;
+        case S_IFDIR:
+                inode->i_op = &cramfs_dir_inode_operations;
+                inode->i_fop = &cramfs_directory_operations;
+                break;
+        case S_IFLNK:
+                inode->i_op = &page_symlink_inode_operations;
+                inode->i_data.a_ops = &cramfs_aops;
+                break;
+        default: /* any other type: the size field is decoded as the device number */
+                init_special_inode(inode, cramfs_inode->mode,
+                                old_decode_dev(cramfs_inode->size));
+        }
         inode->i_mode = cramfs_inode->mode;
         inode->i_uid = cramfs_inode->uid;
-        inode->i_size = cramfs_inode->size;
-        inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
         inode->i_gid = cramfs_inode->gid;
+        /* if the lower 2 bits are zero, the inode contains data */
+        if (!(inode->i_ino & 3)) { /* i.e. cramino() returned cino->offset << 2 rather than offset + 1 */
+                inode->i_size = cramfs_inode->size;
+                inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; /* size rounded up to 512-byte units */
+        }
         /* Struct copy intentional */
         inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
         /* inode->i_nlink is left 1 - arguably wrong for directories,
            but it's the best we can do without reading the directory
            contents.  1 yields the right result in GNU find, even
            without -noleaf option. */
-        if (S_ISREG(inode->i_mode)) {
-                inode->i_fop = &generic_ro_fops;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &cramfs_dir_inode_operations;
-                inode->i_fop = &cramfs_directory_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
-                inode->i_op = &page_symlink_inode_operations;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else {
-                init_special_inode(inode, inode->i_mode,
-                        old_decode_dev(cramfs_inode->size));
-        }
-}
-static struct inode *get_cramfs_inode(struct super_block *sb,
+        unlock_new_inode(inode); /* only reached on the I_NEW path; initialisation is complete */
-                                struct cramfs_inode * cramfs_inode)
-{
-        struct inode *inode;
-        if (CRAMINO(cramfs_inode) == 1) {
-                inode = new_inode(sb);
-                if (inode) {
-                        inode->i_ino = 1;
-                        setup_inode(inode, cramfs_inode);
-                }
-        } else {
-                inode = iget_locked(sb, CRAMINO(cramfs_inode));
-                if (inode && (inode->i_state & I_NEW)) {
-                        setup_inode(inode, cramfs_inode);
-                        unlock_new_inode(inode);
-                }
-        }
         return inode;
 }
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
                printk(KERN_ERR "cramfs: root is not a directory\n");
                goto out;
        }
+        /* correct strange, hard-coded permissions of mkcramfs */
+        super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
        root_offset = super.root.offset << 2;
        if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
                sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
        /* Set it all up.. */
        sb->s_op = &cramfs_ops;
-        root = get_cramfs_inode(sb, &super.root);
+        root = get_cramfs_inode(sb, &super.root, 0);
        if (!root)
                goto out;
        sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                 */
                namelen = de->namelen << 2;
                memcpy(buf, name, namelen);
-                ino = CRAMINO(de);
+                ino = cramino(de, OFFSET(inode) + offset);
                mode = de->mode;
                mutex_unlock(&read_mutex);
                nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
                struct cramfs_inode *de;
                char *name;
                int namelen, retval;
+                int dir_off = OFFSET(dir) + offset;
-                de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
+                de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
                name = (char *)(de+1);
                /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
                if (!retval) {
                        struct cramfs_inode entry = *de;
                        mutex_unlock(&read_mutex);
-                        d_add(dentry, get_cramfs_inode(dir->i_sb, &entry));
+                        d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
                        return NULL;
                }
                /* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9d4e6..2a6bd9a4ae9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
 #include <linux/hardirq.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
+/*
+ * Usage:
+ * dcache->d_inode->i_lock protects:
+ *   - i_dentry, d_alias, d_inode of aliases
+ * dcache_hash_bucket lock protects:
+ *   - the dcache hash table
+ * s_anon bl list spinlock protects:
+ *   - the s_anon list (see __d_drop)
+ * dcache_lru_lock protects:
+ *   - the dcache lru lists and counters
+ * d_lock protects:
+ *   - d_flags
+ *   - d_name
+ *   - d_lru
+ *   - d_count
+ *   - d_unhashed()
+ *   - d_parent and d_subdirs
+ *   - children's d_child and d_parent
+ *   - d_alias, d_inode
+ *
+ * Ordering:
+ * dentry->d_inode->i_lock
+ *   dentry->d_lock
+ *     dcache_lru_lock
+ *     dcache_hash_bucket lock
+ *     s_anon lock
+ *
+ * If there is an ancestor relationship:
+ * dentry->d_parent->...->d_parent->d_lock
+ *   ...
+ *     dentry->d_parent->d_lock
+ *       dentry->d_lock
+ *
+ * If no ancestor relationship:
+ * if (dentry1 < dentry2)
+ *   dentry1->d_lock
+ *     dentry2->d_lock
+ */
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
-EXPORT_SYMBOL(dcache_lock);
+EXPORT_SYMBOL(rename_lock);
 static struct kmem_cache *dentry_cache __read_mostly;
-#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
 /*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
@@ -60,22 +98,51 @@ static struct kmem_cache *dentry_cache __read_mostly;
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
-static struct hlist_head *dentry_hashtable __read_mostly;
+struct dcache_hash_bucket {
+        struct hlist_bl_head head;
+};
+static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
+static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
+                                        unsigned long hash)
+{ /* map a (parent dentry, name hash) pair to its bucket in dentry_hashtable */
+        hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; /* mix the parent's address into the name hash */
+        hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); /* xor in a right-shifted copy of the mixed value */
+        return dentry_hashtable + (hash & D_HASHMASK); /* mask down to a table index */
+}
+static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
+{
+        bit_spin_lock(0, (unsigned long *)&b->head.first); /* bit 0 of the bucket's list-head pointer serves as the per-bucket lock */
+}
+static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
+{
+        __bit_spin_unlock(0, (unsigned long *)&b->head.first); /* releases the bit taken by spin_lock_bucket(); NOTE(review): presumably the non-atomic unlock variant - confirm against linux/bit_spinlock.h */
+}
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+static int get_nr_dentry(void)
+{ /* total of the per-CPU nr_dentry counters, never reported as negative */
+        int i;
+        int sum = 0;
+        for_each_possible_cpu(i)
+                sum += per_cpu(nr_dentry, i); /* individual counters are read without any lock here */
+        return sum < 0 ? 0 : sum; /* clamp: NOTE(review): presumably the sum can dip below zero because d_free() decrements whichever CPU it runs on - confirm */
+}
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
                    size_t *lenp, loff_t *ppos)
 {
-        dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+        dentry_stat.nr_dentry = get_nr_dentry(); /* refresh the snapshot before proc_dointvec() runs; nr_unused needs no refresh because the dentry_lru_* helpers now update dentry_stat.nr_unused directly */
-        dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
         return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -91,35 +158,51 @@ static void __d_free(struct rcu_head *head)
 }
 /*
- * no dcache_lock, please.
+ * no locks, please.
 */
 static void d_free(struct dentry *dentry)
 {
-        percpu_counter_dec(&nr_dentry);
+        BUG_ON(dentry->d_count); /* freeing a dentry that still has references is a bug */
+        this_cpu_dec(nr_dentry); /* per-CPU counter; summed by get_nr_dentry() */
         if (dentry->d_op && dentry->d_op->d_release)
                 dentry->d_op->d_release(dentry); /* optional filesystem hook, called before the memory is released */
         /* if dentry was never inserted into hash, immediate free is OK */
-        if (hlist_unhashed(&dentry->d_hash))
+        if (hlist_bl_unhashed(&dentry->d_hash))
                 __d_free(&dentry->d_u.d_rcu);
         else
                 call_rcu(&dentry->d_u.d_rcu, __d_free); /* otherwise the free is deferred through call_rcu() */
 }
+/**
+ * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
+ * After this call, in-progress rcu-walk path lookup will fail. This
+ * should be called after unhashing, and after changing d_inode (if
+ * the dentry has not already been unhashed).
+ * The caller must hold @dentry->d_lock; this is asserted on entry.
+ * The barrier itself is a write_seqcount_barrier() on @dentry->d_seq.
+ */
+static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+{
+        assert_spin_locked(&dentry->d_lock);
+        /* Go through a barrier */
+        write_seqcount_barrier(&dentry->d_seq);
+}
 /*
 * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined.
+ * d_iput() operation if defined. Dentry has no refcount
+ * and is unhashed.
 */
 static void dentry_iput(struct dentry * dentry)
        __releases(dentry->d_lock)
-        __releases(dcache_lock)
+        __releases(dentry->d_inode->i_lock)
 {
        struct inode *inode = dentry->d_inode;
        if (inode) {
                dentry->d_inode = NULL;
                list_del_init(&dentry->d_alias);
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                if (!inode->i_nlink)
                        fsnotify_inoderemove(inode);
                if (dentry->d_op && dentry->d_op->d_iput)
@@ -128,65 +211,191 @@ static void dentry_iput(struct dentry * dentry)
                        iput(inode);
        } else {
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
        }
 }
 /*
- * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
+ * Release the dentry's inode, using the filesystem
+ * d_iput() operation if defined. dentry remains in-use.
+ */
+static void dentry_unlink_inode(struct dentry * dentry)
+        __releases(dentry->d_lock)
+        __releases(dentry->d_inode->i_lock)
+{
+        struct inode *inode = dentry->d_inode; /* not NULL-checked: dereferenced unconditionally below, unlike dentry_iput() */
+        dentry->d_inode = NULL;
+        list_del_init(&dentry->d_alias); /* take the dentry off its d_alias list */
+        dentry_rcuwalk_barrier(dentry); /* d_inode changed while the dentry stays in use: fail in-progress rcu-walk lookups */
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&inode->i_lock); /* both locks are dropped before the notification and iput calls below */
+        if (!inode->i_nlink)
+                fsnotify_inoderemove(inode);
+        if (dentry->d_op && dentry->d_op->d_iput)
+                dentry->d_op->d_iput(dentry, inode); /* filesystem hook replaces the plain iput() */
+        else
+                iput(inode);
+}
+/*
+ * dentry_lru_(add|del|move_tail) must be called with d_lock held.
 */
 static void dentry_lru_add(struct dentry *dentry)
 {
         if (list_empty(&dentry->d_lru)) { /* tested before taking dcache_lru_lock; d_lru itself is covered by the caller's d_lock */
+                spin_lock(&dcache_lru_lock);
                 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); /* list_add(): inserted at the head of the per-superblock LRU */
                 dentry->d_sb->s_nr_dentry_unused++;
-                percpu_counter_inc(&nr_dentry_unused);
+                dentry_stat.nr_unused++; /* plain global counter, updated under dcache_lru_lock */
+                spin_unlock(&dcache_lru_lock);
         }
 }
+static void __dentry_lru_del(struct dentry *dentry)
+{ /* unlocked helper: takes no lock itself; dentry_lru_del() calls it under dcache_lru_lock */
+        list_del_init(&dentry->d_lru);
+        dentry->d_sb->s_nr_dentry_unused--; /* per-superblock count */
+        dentry_stat.nr_unused--; /* global count */
+}
 static void dentry_lru_del(struct dentry *dentry)
 {
         if (!list_empty(&dentry->d_lru)) { /* no-op when the dentry is not on an LRU list */
-                list_del_init(&dentry->d_lru);
+                spin_lock(&dcache_lru_lock);
-                dentry->d_sb->s_nr_dentry_unused--;
+                __dentry_lru_del(dentry); /* unlink and decrement both unused counters */
-                percpu_counter_dec(&nr_dentry_unused);
+                spin_unlock(&dcache_lru_lock);
         }
 }
 static void dentry_lru_move_tail(struct dentry *dentry)
 {
+        spin_lock(&dcache_lru_lock); /* unlike dentry_lru_add/del, the lock is taken before the list_empty() test */
         if (list_empty(&dentry->d_lru)) {
                 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); /* not on the LRU yet: add at the tail and count it */
                 dentry->d_sb->s_nr_dentry_unused++;
-                percpu_counter_inc(&nr_dentry_unused);
+                dentry_stat.nr_unused++;
         } else {
                 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); /* already counted: only reposition */
         }
+        spin_unlock(&dcache_lru_lock);
 }
 /**
 * d_kill - kill dentry and return parent
 * @dentry: dentry to kill
+ * @parent: parent dentry
 *
 * The dentry must already be unhashed and removed from the LRU.
 *
 * If this is the root of the dentry tree, return NULL.
+ *
+ * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
+ * d_kill.
 */
-static struct dentry *d_kill(struct dentry *dentry)
+static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
         __releases(dentry->d_lock)
-        __releases(dcache_lock)
+        __releases(parent->d_lock)
+        __releases(dentry->d_inode->i_lock)
 {
-        struct dentry *parent;
+        dentry->d_parent = NULL;
         list_del(&dentry->d_u.d_child); /* unlink from the parent's list of children */
-        /*drops the locks, at that point nobody can reach this dentry */
+        if (parent) /* dentry_kill() passes NULL when IS_ROOT(dentry) */
+                spin_unlock(&parent->d_lock);
         dentry_iput(dentry);
+        /*
+         * dentry_iput drops the locks, at which point nobody (except
+         * transient RCU lookups) can reach this dentry.
+         */
+        d_free(dentry);
+        return parent; /* returned via dentry_kill() to dput(), which repeats on it when non-NULL */
+}
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+ * be found through a VFS lookup any more. Note that this is different from
+ * deleting the dentry - d_delete will try to mark the dentry negative if
+ * possible, giving a successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+ * reason (NFS timeouts or autofs deletes).
+ *
+ * __d_drop requires dentry->d_lock.
+ */
+void __d_drop(struct dentry *dentry)
+{
+        if (!(dentry->d_flags & DCACHE_UNHASHED)) { /* already unhashed: nothing to do */
+                if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) { /* this branch locks the superblock's s_anon list instead of a hash bucket */
+                        bit_spin_lock(0,
+                                (unsigned long *)&dentry->d_sb->s_anon.first); /* bit 0 of s_anon.first is the list lock */
+                        dentry->d_flags |= DCACHE_UNHASHED;
+                        hlist_bl_del_init(&dentry->d_hash);
+                        __bit_spin_unlock(0,
+                                (unsigned long *)&dentry->d_sb->s_anon.first); /* NOTE(review): no dentry_rcuwalk_barrier() on this path, unlike the bucket path below - presumably rcu-walk never reaches s_anon entries; confirm */
+                } else {
+                        struct dcache_hash_bucket *b;
+                        b = d_hash(dentry->d_parent, dentry->d_name.hash); /* locate the bucket from parent and name hash */
+                        spin_lock_bucket(b);
+                        /*
+                         * We may not actually need to put DCACHE_UNHASHED
+                         * manipulations under the hash lock, but follow
+                         * the principle of least surprise.
+                         */
+                        dentry->d_flags |= DCACHE_UNHASHED;
+                        hlist_bl_del_rcu(&dentry->d_hash); /* RCU-variant removal from the bucket list */
+                        spin_unlock_bucket(b);
+                        dentry_rcuwalk_barrier(dentry); /* unhashed: make in-progress rcu-walk lookups fail (asserts d_lock is held) */
+                }
+        }
+}
+EXPORT_SYMBOL(__d_drop);
+void d_drop(struct dentry *dentry)
+{ /* locking wrapper: takes d_lock, which __d_drop() requires, around the unhash */
+        spin_lock(&dentry->d_lock);
+        __d_drop(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_drop);
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+        __releases(dentry->d_lock)
+{
+        struct inode *inode;
+        struct dentry *parent;
+        inode = dentry->d_inode;
+        if (inode && !spin_trylock(&inode->i_lock)) { /* i_lock ranks above d_lock in the documented ordering, so it is only trylocked while d_lock is held */
+relock: /* shared back-off path: also reached by goto when the parent trylock fails */
+                spin_unlock(&dentry->d_lock);
+                cpu_relax();
+                return dentry; /* try again with same dentry */
+        }
         if (IS_ROOT(dentry))
                 parent = NULL;
         else
                 parent = dentry->d_parent;
-        d_free(dentry);
+        if (parent && !spin_trylock(&parent->d_lock)) { /* the parent's d_lock also ranks above the child's: trylock only */
-        return parent;
+                if (inode)
+                        spin_unlock(&inode->i_lock); /* undo the successful i_lock trylock before backing off */
+                goto relock;
+        }
+        if (ref)
+                dentry->d_count--; /* drop the caller's reference now that all needed locks are held */
+        /* if dentry was on the d_lru list delete it from there */
+        dentry_lru_del(dentry);
+        /* if it was on the hash then remove it */
+        __d_drop(dentry);
+        return d_kill(dentry, parent); /* d_kill() releases the locks taken above and returns the parent (NULL for a root) */
 }
 /* 
@@ -214,34 +423,26 @@ static struct dentry *d_kill(struct dentry *dentry)
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
- *
- * no dcache lock, please.
 */
 void dput(struct dentry *dentry)
 {
        if (!dentry)
                return;
 repeat:
-        if (atomic_read(&dentry->d_count) == 1)
+        if (dentry->d_count == 1)
                might_sleep();
-        if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
-                return;
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count)) {
+        BUG_ON(!dentry->d_count);
+        if (dentry->d_count > 1) {
+                dentry->d_count--;
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
                return;
        }
-        /*
+        if (dentry->d_flags & DCACHE_OP_DELETE) {
-         * AV: ->d_delete() is _NOT_ allowed to block now.
-         */
-        if (dentry->d_op && dentry->d_op->d_delete) {
                if (dentry->d_op->d_delete(dentry))
-                        goto unhash_it;
+                        goto kill_it;
        }
        /* Unreachable? Get rid of it */
@@ -252,16 +453,12 @@ repeat:
        dentry->d_flags |= DCACHE_REFERENCED;
        dentry_lru_add(dentry);
-        spin_unlock(&dentry->d_lock);
+        dentry->d_count--;
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return;
-unhash_it:
-        __d_drop(dentry);
 kill_it:
-        /* if dentry was on the d_lru list delete it from there */
+        dentry = dentry_kill(dentry, 1);
-        dentry_lru_del(dentry);
-        dentry = d_kill(dentry);
        if (dentry)
                goto repeat;
 }
@@ -284,9 +481,9 @@ int d_invalidate(struct dentry * dentry)
        /*
         * If it's already been dropped, return OK.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
                return 0;
        }
        /*
@@ -294,9 +491,9 @@ int d_invalidate(struct dentry * dentry)
         * to get rid of unused child entries.
         */
        if (!list_empty(&dentry->d_subdirs)) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&dentry->d_lock);
                shrink_dcache_parent(dentry);
-                spin_lock(&dcache_lock);
+                spin_lock(&dentry->d_lock);
        }
        /*
@@ -309,35 +506,61 @@ int d_invalidate(struct dentry * dentry)
         * we might still populate it if it was a
         * working directory or similar).
         */
-        spin_lock(&dentry->d_lock);
+        if (dentry->d_count > 1) {
-        if (atomic_read(&dentry->d_count) > 1) {
                if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
                        return -EBUSY;
                }
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        return 0;
 }
 EXPORT_SYMBOL(d_invalidate);
-/* This should be called _only_ with dcache_lock held */
+/* This must be called with d_lock held */
-static inline struct dentry * __dget_locked(struct dentry *dentry)
+static inline void __dget_dlock(struct dentry *dentry)
 {
-        atomic_inc(&dentry->d_count);
+        /* Non-atomic increment of the reference count; nothing is returned. */
+        dentry->d_count++;
-        dentry_lru_del(dentry);
-        return dentry;
 }
-struct dentry * dget_locked(struct dentry *dentry)
+/* Take one reference on @dentry, holding dentry->d_lock around __dget_dlock(). */
+static inline void __dget(struct dentry *dentry)
 {
-        return __dget_locked(dentry);
+        spin_lock(&dentry->d_lock);
+        __dget_dlock(dentry);
+        spin_unlock(&dentry->d_lock);
+}
+/*
+ * dget_parent - return @dentry's parent with one extra reference taken.
+ *
+ * dentry->d_parent is sampled before the parent's d_lock is held, so once
+ * the lock is taken the pointer is compared against dentry->d_parent again
+ * and the whole sequence is retried if the two differ.
+ *
+ * Returns NULL when dentry->d_parent is NULL.
+ */
+struct dentry *dget_parent(struct dentry *dentry)
+{
+        struct dentry *ret;
+repeat:
+        /*
+         * Don't need rcu_dereference because we re-check it was correct under
+         * the lock.
+         */
+        rcu_read_lock();
+        ret = dentry->d_parent;
+        if (!ret) {
+                rcu_read_unlock();
+                goto out;
+        }
+        spin_lock(&ret->d_lock);
+        if (unlikely(ret != dentry->d_parent)) {
+                /* Parent changed between the read and the lock: start over. */
+                spin_unlock(&ret->d_lock);
+                rcu_read_unlock();
+                goto repeat;
+        }
+        rcu_read_unlock();
+        /* ret->d_lock is still held: the count is checked and bumped under it. */
+        BUG_ON(!ret->d_count);
+        ret->d_count++;
+        spin_unlock(&ret->d_lock);
+out:
+        return ret;
 }
-EXPORT_SYMBOL(dget_locked);
+EXPORT_SYMBOL(dget_parent);
 /**
 * d_find_alias - grab a hashed alias of inode
@@ -355,42 +578,51 @@ EXPORT_SYMBOL(dget_locked);
 * any other hashed alias over that one unless @want_discon is set,
 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
 */
+static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
-static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
 {
-        struct list_head *head, *next, *tmp;
+        struct dentry *alias, *discon_alias;
-        struct dentry *alias, *discon_alias=NULL;
-        head = &inode->i_dentry;
+again:
-        next = inode->i_dentry.next;
+        discon_alias = NULL;
-        while (next != head) {
+        /* Each alias is examined with its own d_lock held. */
+        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-                tmp = next;
+                spin_lock(&alias->d_lock);
-                next = tmp->next;
-                prefetch(next);
-                alias = list_entry(tmp, struct dentry, d_alias);
                 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
                         if (IS_ROOT(alias) &&
-                            (alias->d_flags & DCACHE_DISCONNECTED))
+                            (alias->d_flags & DCACHE_DISCONNECTED)) {
                                 discon_alias = alias;
-                        else if (!want_discon) {
+                        } else if (!want_discon) {
-                                __dget_locked(alias);
+                                __dget_dlock(alias);
+                                spin_unlock(&alias->d_lock);
+                                return alias;
+                        }
+                }
+                spin_unlock(&alias->d_lock);
+        }
+        /*
+         * The remembered candidate's d_lock was dropped during the walk
+         * above, so re-take it and repeat the same checks before taking a
+         * reference.  If they no longer hold, rescan the list from "again".
+         */
+        if (discon_alias) {
+                alias = discon_alias;
+                spin_lock(&alias->d_lock);
+                if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+                        if (IS_ROOT(alias) &&
+                            (alias->d_flags & DCACHE_DISCONNECTED)) {
+                                __dget_dlock(alias);
+                                spin_unlock(&alias->d_lock);
                                 return alias;
                         }
                 }
+                spin_unlock(&alias->d_lock);
+                goto again;
         }
-        if (discon_alias)
+        /* No alias qualified and no disconnected candidate was seen. */
+        return NULL;
-                __dget_locked(discon_alias);
-        return discon_alias;
 }
-struct dentry * d_find_alias(struct inode *inode)
+struct dentry *d_find_alias(struct inode *inode)
 {
         struct dentry *de = NULL;
+        /* Unlocked emptiness check; i_lock is taken only if the list is non-empty. */
         if (!list_empty(&inode->i_dentry)) {
-                spin_lock(&dcache_lock);
+                spin_lock(&inode->i_lock);
+                /* want_discon == 0 is passed to __d_find_alias(). */
                 de = __d_find_alias(inode, 0);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
         }
         return de;
 }
@@ -404,54 +636,61 @@ void d_prune_aliases(struct inode *inode)
 {
        struct dentry *dentry;
 restart:
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                spin_lock(&dentry->d_lock);
-                if (!atomic_read(&dentry->d_count)) {
+                if (!dentry->d_count) {
-                        __dget_locked(dentry);
+                        __dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        dput(dentry);
                        goto restart;
                }
                spin_unlock(&dentry->d_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_prune_aliases);
 /*
- * Throw away a dentry - free the inode, dput the parent.  This requires that
+ * Try to throw away a dentry - free the inode, dput the parent.
- * the LRU list has already been removed.
+ * Requires dentry->d_lock is held, and dentry->d_count == 0.
+ * Releases dentry->d_lock.
 *
- * Try to prune ancestors as well.  This is necessary to prevent
+ * This may fail if locks cannot be acquired; no problem, just try again.
- * quadratic behavior of shrink_dcache_parent(), but is also expected
- * to be beneficial in reducing dentry cache fragmentation.
 */
-static void prune_one_dentry(struct dentry * dentry)
+static void try_prune_one_dentry(struct dentry *dentry)
         __releases(dentry->d_lock)
-        __releases(dcache_lock)
-        __acquires(dcache_lock)
 {
-        __d_drop(dentry);
+        struct dentry *parent;
-        dentry = d_kill(dentry);
+        parent = dentry_kill(dentry, 0);
         /*
-         * Prune ancestors.  Locking is simpler than in dput(),
+         * If dentry_kill returns NULL, we have nothing more to do.
-         * because dcache_lock needs to be taken anyway.
+         * if it returns the same dentry, trylocks failed. In either
+         * case, just loop again.
+         *
+         * Otherwise, we need to prune ancestors too. This is necessary
+         * to prevent quadratic behavior of shrink_dcache_parent(), but
+         * is also expected to be beneficial in reducing dentry cache
+         * fragmentation.
          */
-        spin_lock(&dcache_lock);
+        if (!parent)
+                return;
+        if (parent == dentry)
+                return;
+        /* Prune ancestors. */
+        dentry = parent;
         while (dentry) {
-                if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+                /*
+                 * Drop one reference on this ancestor under its d_lock.
+                 * If other references remain (d_count > 1) the walk stops.
+                 */
+                spin_lock(&dentry->d_lock);
+                if (dentry->d_count > 1) {
+                        dentry->d_count--;
+                        spin_unlock(&dentry->d_lock);
                         return;
+                }
-                if (dentry->d_op && dentry->d_op->d_delete)
+                /* Otherwise pass it to dentry_kill() and continue with what it returns. */
+                dentry = dentry_kill(dentry, 1);
-                        dentry->d_op->d_delete(dentry);
-                dentry_lru_del(dentry);
-                __d_drop(dentry);
-                dentry = d_kill(dentry);
-                spin_lock(&dcache_lock);
         }
 }
@@ -459,24 +698,35 @@ static void shrink_dentry_list(struct list_head *list)
 {
        struct dentry *dentry;
-        while (!list_empty(list)) {
+        rcu_read_lock();
-                dentry = list_entry(list->prev, struct dentry, d_lru);
+        for (;;) {
-                dentry_lru_del(dentry);
+                dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
+                if (&dentry->d_lru == list)
+                        break; /* empty */
+                spin_lock(&dentry->d_lock);
+                if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
+                        spin_unlock(&dentry->d_lock);
+                        continue;
+                }
                /*
                 * We found an inuse dentry which was not removed from
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
-                spin_lock(&dentry->d_lock);
+                if (dentry->d_count) {
-                if (atomic_read(&dentry->d_count)) {
+                        dentry_lru_del(dentry);
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
-                prune_one_dentry(dentry);
-                /* dentry->d_lock was dropped in prune_one_dentry() */
+                rcu_read_unlock();
-                cond_resched_lock(&dcache_lock);
+                try_prune_one_dentry(dentry);
+                rcu_read_lock();
        }
+        rcu_read_unlock();
 }
 /**
@@ -495,42 +745,44 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
        LIST_HEAD(tmp);
        int cnt = *count;
-        spin_lock(&dcache_lock);
+relock:
+        spin_lock(&dcache_lru_lock);
        while (!list_empty(&sb->s_dentry_lru)) {
                dentry = list_entry(sb->s_dentry_lru.prev,
                                struct dentry, d_lru);
                BUG_ON(dentry->d_sb != sb);
+                if (!spin_trylock(&dentry->d_lock)) {
+                        spin_unlock(&dcache_lru_lock);
+                        cpu_relax();
+                        goto relock;
+                }
                /*
                 * If we are honouring the DCACHE_REFERENCED flag and the
                 * dentry has this flag set, don't free it.  Clear the flag
                 * and put it back on the LRU.
                 */
-                if (flags & DCACHE_REFERENCED) {
+                if (flags & DCACHE_REFERENCED &&
-                        spin_lock(&dentry->d_lock);
+                                dentry->d_flags & DCACHE_REFERENCED) {
-                        if (dentry->d_flags & DCACHE_REFERENCED) {
+                        dentry->d_flags &= ~DCACHE_REFERENCED;
-                                dentry->d_flags &= ~DCACHE_REFERENCED;
+                        list_move(&dentry->d_lru, &referenced);
-                                list_move(&dentry->d_lru, &referenced);
-                                spin_unlock(&dentry->d_lock);
-                                cond_resched_lock(&dcache_lock);
-                                continue;
-                        }
                        spin_unlock(&dentry->d_lock);
+                } else {
+                        list_move_tail(&dentry->d_lru, &tmp);
+                        spin_unlock(&dentry->d_lock);
+                        if (!--cnt)
+                                break;
                }
+                cond_resched_lock(&dcache_lru_lock);
-                list_move_tail(&dentry->d_lru, &tmp);
-                if (!--cnt)
-                        break;
-                cond_resched_lock(&dcache_lock);
        }
-        *count = cnt;
-        shrink_dentry_list(&tmp);
        if (!list_empty(&referenced))
                list_splice(&referenced, &sb->s_dentry_lru);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dcache_lru_lock);
+        shrink_dentry_list(&tmp);
+        *count = cnt;
 }
 /**
@@ -546,13 +798,12 @@ static void prune_dcache(int count)
 {
        struct super_block *sb, *p = NULL;
        int w_count;
-        int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        int unused = dentry_stat.nr_unused;
        int prune_ratio;
        int pruned;
        if (unused == 0 || count == 0)
                return;
-        spin_lock(&dcache_lock);
        if (count >= unused)
                prune_ratio = 1;
        else
@@ -589,11 +840,9 @@ static void prune_dcache(int count)
                if (down_read_trylock(&sb->s_umount)) {
                        if ((sb->s_root != NULL) &&
                            (!list_empty(&sb->s_dentry_lru))) {
-                                spin_unlock(&dcache_lock);
                                __shrink_dcache_sb(sb, &w_count,
                                                DCACHE_REFERENCED);
                                pruned -= w_count;
-                                spin_lock(&dcache_lock);
                        }
                        up_read(&sb->s_umount);
                }
@@ -609,7 +858,6 @@ static void prune_dcache(int count)
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
-        spin_unlock(&dcache_lock);
 }
 /**
@@ -623,12 +871,14 @@ void shrink_dcache_sb(struct super_block *sb)
 {
        LIST_HEAD(tmp);
-        spin_lock(&dcache_lock);
+        spin_lock(&dcache_lru_lock);
        while (!list_empty(&sb->s_dentry_lru)) {
                list_splice_init(&sb->s_dentry_lru, &tmp);
+                spin_unlock(&dcache_lru_lock);
                shrink_dentry_list(&tmp);
+                spin_lock(&dcache_lru_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dcache_lru_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -645,10 +895,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
        BUG_ON(!IS_ROOT(dentry));
        /* detach this root from the system */
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        dentry_lru_del(dentry);
        __d_drop(dentry);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        for (;;) {
                /* descend to the first leaf in the current subtree */
@@ -657,14 +907,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                        /* this is a branch with children - detach all of them
                         * from the system in one go */
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
                        list_for_each_entry(loop, &dentry->d_subdirs,
                                            d_u.d_child) {
+                                spin_lock_nested(&loop->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
                                dentry_lru_del(loop);
                                __d_drop(loop);
-                                cond_resched_lock(&dcache_lock);
+                                spin_unlock(&loop->d_lock);
                        }
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
                        /* move to the first child */
                        dentry = list_entry(dentry->d_subdirs.next,
@@ -676,7 +928,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                do {
                        struct inode *inode;
-                        if (atomic_read(&dentry->d_count) != 0) {
+                        if (dentry->d_count != 0) {
                                printk(KERN_ERR
                                       "BUG: Dentry %p{i=%lx,n=%s}"
                                       " still in use (%d)"
@@ -685,20 +937,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                                       dentry->d_inode ?
                                       dentry->d_inode->i_ino : 0UL,
                                       dentry->d_name.name,
-                                       atomic_read(&dentry->d_count),
+                                       dentry->d_count,
                                       dentry->d_sb->s_type->name,
                                       dentry->d_sb->s_id);
                                BUG();
                        }
-                        if (IS_ROOT(dentry))
+                        if (IS_ROOT(dentry)) {
                                parent = NULL;
-                        else {
+                                list_del(&dentry->d_u.d_child);
+                        } else {
                                parent = dentry->d_parent;
-                                atomic_dec(&parent->d_count);
+                                spin_lock(&parent->d_lock);
+                                parent->d_count--;
+                                list_del(&dentry->d_u.d_child);
+                                spin_unlock(&parent->d_lock);
                        }
-                        list_del(&dentry->d_u.d_child);
                        detached++;
                        inode = dentry->d_inode;
@@ -728,8 +983,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 /*
 * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock, and only need dcache_lock when
+ * - we don't need to use dentry->d_lock because:
- *   removing the dentry from the system lists and hashes because:
 *   - the superblock is detached from all mountings and open files, so the
 *     dentry trees will not be rearranged by the VFS
 *   - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -746,11 +1000,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
        dentry = sb->s_root;
        sb->s_root = NULL;
-        atomic_dec(&dentry->d_count);
+        spin_lock(&dentry->d_lock);
+        dentry->d_count--;
+        spin_unlock(&dentry->d_lock);
        shrink_dcache_for_umount_subtree(dentry);
-        while (!hlist_empty(&sb->s_anon)) {
+        while (!hlist_bl_empty(&sb->s_anon)) {
-                dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
+                dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
                shrink_dcache_for_umount_subtree(dentry);
        }
 }
@@ -768,15 +1024,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
 * Return true if the parent or its subdirectories contain
 * a mount point
 */
- 
 int have_submounts(struct dentry *parent)
 {
-        struct dentry *this_parent = parent;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
+        int locked = 0;
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = parent;
-        spin_lock(&dcache_lock);
        if (d_mountpoint(parent))
                goto positive;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -784,27 +1045,65 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                /* Have we found a mount point ? */
-                if (d_mountpoint(dentry))
+                if (d_mountpoint(dentry)) {
+                        spin_unlock(&dentry->d_lock);
+                        spin_unlock(&this_parent->d_lock);
                        goto positive;
+                }
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
+                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                this_parent = this_parent->d_parent;
+                struct dentry *child;
+                tmp = this_parent->d_parent;
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                         (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return 0; /* No mount points found in tree */
 positive:
-        spin_unlock(&dcache_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return 1;
+rename_retry:
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 EXPORT_SYMBOL(have_submounts);
@@ -824,11 +1123,16 @@ EXPORT_SYMBOL(have_submounts);
 */
 static int select_parent(struct dentry * parent)
 {
-        struct dentry *this_parent = parent;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
        int found = 0;
+        int locked = 0;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = parent;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -837,11 +1141,13 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                /* 
                 * move only zero ref count dentries to the end 
                 * of the unused list for prune_dcache
                 */
-                if (!atomic_read(&dentry->d_count)) {
+                if (!dentry->d_count) {
                        dentry_lru_move_tail(dentry);
                        found++;
                } else {
@@ -853,28 +1159,63 @@ resume:
                 * ensures forward progress). We'll be coming back to find
                 * the rest.
                 */
-                if (found && need_resched())
+                if (found && need_resched()) {
+                        spin_unlock(&dentry->d_lock);
                        goto out;
+                }
                /*
                 * Descend a level if the d_subdirs list is non-empty.
                 */
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
+                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                this_parent = this_parent->d_parent;
+                struct dentry *child;
+                tmp = this_parent->d_parent;
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                        (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
 out:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
        return found;
+rename_retry:
+        if (found)
+                return found;
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 /**
@@ -908,16 +1249,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
-        int nr_unused;
+        /* nr == 0 skips pruning entirely; only the scaled count below is returned. */
         if (nr) {
+                /* Without __GFP_FS in gfp_mask no pruning is attempted and -1 is returned. */
                 if (!(gfp_mask & __GFP_FS))
                         return -1;
                 prune_dcache(nr);
         }
-        nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        /* Integer division by 100 happens before the multiply by the sysctl. */
+        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
-        return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dcache_shrinker = {
@@ -960,38 +1298,54 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;
-        atomic_set(&dentry->d_count, 1);
+        dentry->d_count = 1;
        dentry->d_flags = DCACHE_UNHASHED;
        spin_lock_init(&dentry->d_lock);
+        seqcount_init(&dentry->d_seq);
        dentry->d_inode = NULL;
        dentry->d_parent = NULL;
        dentry->d_sb = NULL;
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
-        dentry->d_mounted = 0;
+        INIT_HLIST_BL_NODE(&dentry->d_hash);
-        INIT_HLIST_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
        INIT_LIST_HEAD(&dentry->d_alias);
+        INIT_LIST_HEAD(&dentry->d_u.d_child);
        if (parent) {
-                dentry->d_parent = dget(parent);
+                spin_lock(&parent->d_lock);
+                /*
+                 * don't need child lock because it is not subject
+                 * to concurrency here
+                 */
+                __dget_dlock(parent);
+                dentry->d_parent = parent;
                dentry->d_sb = parent->d_sb;
-        } else {
+                d_set_d_op(dentry, dentry->d_sb->s_d_op);
-                INIT_LIST_HEAD(&dentry->d_u.d_child);
-        }
-        spin_lock(&dcache_lock);
-        if (parent)
                list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-        spin_unlock(&dcache_lock);
+                spin_unlock(&parent->d_lock);
+        }
-        percpu_counter_inc(&nr_dentry);
+        this_cpu_inc(nr_dentry);
        return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
+/*
+ * d_alloc_pseudo - allocate a parentless dentry bound to @sb.
+ *
+ * Calls d_alloc(NULL, name).  On success the dentry gets @sb as d_sb,
+ * sb->s_d_op installed through d_set_d_op(), itself as d_parent, and the
+ * DCACHE_DISCONNECTED flag.  The d_alloc() result is returned as-is, so a
+ * NULL from d_alloc() is passed through to the caller.
+ */
+struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
+{
+        struct dentry *dentry = d_alloc(NULL, name);
+        if (dentry) {
+                dentry->d_sb = sb;
+                d_set_d_op(dentry, dentry->d_sb->s_d_op);
+                /* Self-parented, as d_alloc() was given no parent. */
+                dentry->d_parent = dentry;
+                dentry->d_flags |= DCACHE_DISCONNECTED;
+        }
+        return dentry;
+}
+EXPORT_SYMBOL(d_alloc_pseudo);
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
        struct qstr q;
@@ -1003,12 +1357,39 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 }
 EXPORT_SYMBOL(d_alloc_name);
-/* the caller must hold dcache_lock */
+/*
+ * d_set_d_op - install @op as @dentry's operations and record in d_flags
+ * which of the d_hash/d_compare/d_revalidate/d_delete callbacks it provides.
+ *
+ * Emits a one-time warning if the dentry already has a d_op, and another
+ * if any DCACHE_OP_* bit is already set.  A NULL @op is stored and leaves
+ * d_flags untouched.
+ */
+void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+        WARN_ON_ONCE(dentry->d_op);
+        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH  |
+                                DCACHE_OP_COMPARE       |
+                                DCACHE_OP_REVALIDATE    |
+                                DCACHE_OP_DELETE ));
+        dentry->d_op = op;
+        if (!op)
+                return;
+        /* One flag bit per non-NULL callback in @op. */
+        if (op->d_hash)
+                dentry->d_flags |= DCACHE_OP_HASH;
+        if (op->d_compare)
+                dentry->d_flags |= DCACHE_OP_COMPARE;
+        if (op->d_revalidate)
+                dentry->d_flags |= DCACHE_OP_REVALIDATE;
+        if (op->d_delete)
+                dentry->d_flags |= DCACHE_OP_DELETE;
+}
+EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
-        if (inode)
+        /*
+         * Under dentry->d_lock: link the dentry into inode->i_dentry when
+         * @inode is non-NULL, store d_inode (possibly NULL), then call
+         * dentry_rcuwalk_barrier() before dropping the lock.
+         */
+        spin_lock(&dentry->d_lock);
+        if (inode) {
+                if (unlikely(IS_AUTOMOUNT(inode)))
+                        dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
                 list_add(&dentry->d_alias, &inode->i_dentry);
+        }
         dentry->d_inode = inode;
+        dentry_rcuwalk_barrier(dentry);
+        spin_unlock(&dentry->d_lock);
+        /* The fsnotify hook runs after d_lock has been released. */
         fsnotify_d_instantiate(dentry, inode);
 }
@@ -1030,9 +1411,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 void d_instantiate(struct dentry *entry, struct inode * inode)
 {
         BUG_ON(!list_empty(&entry->d_alias));
-        spin_lock(&dcache_lock);
+        /* A NULL inode has no i_lock to take; __d_instantiate() still runs. */
+        if (inode)
+                spin_lock(&inode->i_lock);
         __d_instantiate(entry, inode);
-        spin_unlock(&dcache_lock);
+        if (inode)
+                spin_unlock(&inode->i_lock);
+        /* The security hook is called with i_lock already dropped. */
         security_d_instantiate(entry, inode);
 }
 EXPORT_SYMBOL(d_instantiate);
@@ -1069,15 +1452,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
                struct qstr *qstr = &alias->d_name;
+                /*
+                 * Don't need alias->d_lock here, because aliases with
+                 * d_parent == entry->d_parent are not subject to name or
+                 * parent changes, because the parent inode i_mutex is held.
+                 */
                if (qstr->hash != hash)
                        continue;
                if (alias->d_parent != entry->d_parent)
                        continue;
-                if (qstr->len != len)
+                if (dentry_cmp(qstr->name, qstr->len, name, len))
                        continue;
-                if (memcmp(qstr->name, name, len))
+                __dget(alias);
-                        continue;
-                dget_locked(alias);
                return alias;
        }
@@ -1091,9 +1477,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
        BUG_ON(!list_empty(&entry->d_alias));
-        spin_lock(&dcache_lock);
+        if (inode)
+                spin_lock(&inode->i_lock);
        result = __d_instantiate_unique(entry, inode);
-        spin_unlock(&dcache_lock);
+        if (inode)
+                spin_unlock(&inode->i_lock);
        if (!result) {
                security_d_instantiate(entry, inode);
@@ -1126,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
                res = d_alloc(NULL, &name);
                if (res) {
                        res->d_sb = root_inode->i_sb;
+                        d_set_d_op(res, res->d_sb->s_d_op);
                        res->d_parent = res;
                        d_instantiate(res, root_inode);
                }
@@ -1134,14 +1523,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
-static inline struct hlist_head *d_hash(struct dentry *parent,
-                                        unsigned long hash)
-{
-        hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
-        hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
-        return dentry_hashtable + (hash & D_HASHMASK);
-}
 /**
 * d_obtain_alias - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
@@ -1182,10 +1563,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
        }
        tmp->d_parent = tmp; /* make sure dput doesn't croak */
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        res = __d_find_alias(inode, 0);
        if (res) {
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                dput(tmp);
                goto out_iput;
        }
@@ -1193,14 +1575,17 @@ struct dentry *d_obtain_alias(struct inode *inode)
        /* attach a disconnected dentry */
        spin_lock(&tmp->d_lock);
        tmp->d_sb = inode->i_sb;
+        d_set_d_op(tmp, tmp->d_sb->s_d_op);
        tmp->d_inode = inode;
        tmp->d_flags |= DCACHE_DISCONNECTED;
-        tmp->d_flags &= ~DCACHE_UNHASHED;
        list_add(&tmp->d_alias, &inode->i_dentry);
-        hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+        bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
+        tmp->d_flags &= ~DCACHE_UNHASHED;
+        hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
+        __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
        spin_unlock(&tmp->d_lock);
+        spin_unlock(&inode->i_lock);
-        spin_unlock(&dcache_lock);
        return tmp;
 out_iput:
@@ -1230,18 +1615,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
        struct dentry *new = NULL;
        if (inode && S_ISDIR(inode->i_mode)) {
-                spin_lock(&dcache_lock);
+                spin_lock(&inode->i_lock);
                new = __d_find_alias(inode, 1);
                if (new) {
                        BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        security_d_instantiate(new, inode);
                        d_move(new, dentry);
                        iput(inode);
                } else {
-                        /* already taking dcache_lock, so d_add() by hand */
+                        /* already taking inode->i_lock, so d_add() by hand */
                        __d_instantiate(dentry, inode);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&inode->i_lock);
                        security_d_instantiate(dentry, inode);
                        d_rehash(dentry);
                }
@@ -1314,10 +1699,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
         * Negative dentry: instantiate it unless the inode is a directory and
         * already has a dentry.
         */
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
                __d_instantiate(found, inode);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&inode->i_lock);
                security_d_instantiate(found, inode);
                return found;
        }
@@ -1327,8 +1712,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
         * reference to it, move it in place and use it.
         */
        new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-        dget_locked(new);
+        __dget(new);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        security_d_instantiate(found, inode);
        d_move(new, found);
        iput(inode);
@@ -1342,6 +1727,112 @@ err_out:
 EXPORT_SYMBOL(d_add_ci);
 /**
+ * __d_lookup_rcu - search for a dentry (racy, store-free)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * @seq: returns d_seq value at the point where the dentry was found
+ * @inode: returns dentry->d_inode when the inode was found valid.
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup_rcu is the dcache lookup function for rcu-walk name
+ * resolution (store-free path walking) design described in
+ * Documentation/filesystems/path-lookup.txt.
+ *
+ * This is not to be used outside core vfs.
+ *
+ * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
+ * held, and rcu_read_lock held. The returned dentry must not be stored into
+ * without taking d_lock and checking d_seq sequence count against @seq
+ * returned here.
+ *
+ * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * function.
+ *
+ * Alternatively, __d_lookup_rcu may be called again to look up the child of
+ * the returned dentry, so long as its parent's seqlock is checked after the
+ * child is looked up. Thus, an interlocking stepping of sequence lock checks
+ * is formed, giving integrity down the path walk.
+ */
+struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
+                                unsigned *seq, struct inode **inode)
+{
+        unsigned int len = name->len;
+        unsigned int hash = name->hash;
+        const unsigned char *str = name->name;
+        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_node *node;
+        struct dentry *dentry;
+        /*
+         * Note: There is significant duplication with __d_lookup which is
+         * required to prevent single threaded performance regressions
+         * especially on architectures where smp_rmb (in seqcounts) are costly.
+         * Keep the two functions in sync.
+         */
+        /*
+         * The hash list is protected using RCU.
+         *
+         * Carefully use d_seq when comparing a candidate dentry, to avoid
+         * races with d_move().
+         *
+         * It is possible that concurrent renames can mess up our list
+         * walk here and result in missing our dentry, resulting in the
+         * false-negative result. d_lookup() protects against concurrent
+         * renames using rename_lock seqlock.
+         *
+         * See Documentation/vfs/dcache-locking.txt for more details.
+         */
+        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
+                struct inode *i;
+                const char *tname;
+                int tlen;
+                /* Cheap filter before touching d_seq: the hash must match. */
+                if (dentry->d_name.hash != hash)
+                        continue;
+seqretry:
+                /*
+                 * Sample d_seq before reading parent, name and inode so the
+                 * read_seqcount_retry() below can detect a concurrent update
+                 * and restart from here. The sampled value is also what the
+                 * caller receives through @seq on success.
+                 */
+                *seq = read_seqcount_begin(&dentry->d_seq);
+                if (dentry->d_parent != parent)
+                        continue;
+                if (d_unhashed(dentry))
+                        continue;
+                tlen = dentry->d_name.len;
+                tname = dentry->d_name.name;
+                i = dentry->d_inode;
+                prefetch(tname);
+                if (i)
+                        prefetch(i);
+                /*
+                 * This seqcount check is required to ensure name and
+                 * len are loaded atomically, so as not to walk off the
+                 * edge of memory when walking. If we could load this
+                 * atomically some other way, we could drop this check.
+                 */
+                if (read_seqcount_retry(&dentry->d_seq, *seq))
+                        goto seqretry;
+                if (parent->d_flags & DCACHE_OP_COMPARE) {
+                        /*
+                         * *inode is read here as the parent's inode (the
+                         * same argument slot receives parent->d_inode in
+                         * __d_lookup), i.e. the value the caller stored
+                         * before the call. It is only overwritten with the
+                         * child's inode on a match below.
+                         */
+                        if (parent->d_op->d_compare(parent, *inode,
+                                                dentry, i,
+                                                tlen, tname, name))
+                                continue;
+                } else {
+                        if (dentry_cmp(tname, tlen, str, len))
+                                continue;
+                }
+                /*
+                 * No extra seqcount check is required after the name
+                 * compare. The caller must perform a seqcount check in
+                 * order to do anything useful with the returned dentry
+                 * anyway.
+                 */
+                *inode = i;
+                return dentry;
+        }
+        /* No candidate in this bucket matched; *seq and *inode are not final. */
+        return NULL;
+}
+/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
@@ -1352,10 +1843,10 @@ EXPORT_SYMBOL(d_add_ci);
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
-struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
 {
-        struct dentry * dentry = NULL;
+        struct dentry *dentry;
-        unsigned long seq;
+        unsigned seq;
        do {
                seq = read_seqbegin(&rename_lock);
@@ -1367,7 +1858,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
-/*
+/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
@@ -1382,17 +1873,24 @@ EXPORT_SYMBOL(d_lookup);
 *
 * __d_lookup callers must be commented.
 */
-struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 {
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
-        struct hlist_head *head = d_hash(parent,hash);
+        struct dcache_hash_bucket *b = d_hash(parent, hash);
+        struct hlist_bl_node *node;
        struct dentry *found = NULL;
-        struct hlist_node *node;
        struct dentry *dentry;
        /*
+         * Note: There is significant duplication with __d_lookup_rcu which is
+         * required to prevent single threaded performance regressions
+         * especially on architectures where smp_rmb (in seqcounts) are costly.
+         * Keep the two functions in sync.
+         */
+        /*
         * The hash list is protected using RCU.
         *
         * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1407,25 +1905,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
         */
        rcu_read_lock();
        
-        hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+        hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
-                struct qstr *qstr;
+                const char *tname;
+                int tlen;
                if (dentry->d_name.hash != hash)
                        continue;
-                if (dentry->d_parent != parent)
-                        continue;
                spin_lock(&dentry->d_lock);
-                /*
-                 * Recheck the dentry after taking the lock - d_move may have
-                 * changed things. Don't bother checking the hash because
-                 * we're about to compare the whole name anyway.
-                 */
                if (dentry->d_parent != parent)
                        goto next;
-                /* non-existing due to RCU? */
                if (d_unhashed(dentry))
                        goto next;
@@ -1433,18 +1922,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                 * It is safe to compare names since d_move() cannot
                 * change the qstr (protected by d_lock).
                 */
-                qstr = &dentry->d_name;
+                tlen = dentry->d_name.len;
-                if (parent->d_op && parent->d_op->d_compare) {
+                tname = dentry->d_name.name;
-                        if (parent->d_op->d_compare(parent, qstr, name))
+                if (parent->d_flags & DCACHE_OP_COMPARE) {
+                        if (parent->d_op->d_compare(parent, parent->d_inode,
+                                                dentry, dentry->d_inode,
+                                                tlen, tname, name))
                                goto next;
                } else {
-                        if (qstr->len != len)
+                        if (dentry_cmp(tname, tlen, str, len))
-                                goto next;
-                        if (memcmp(qstr->name, str, len))
                                goto next;
                }
-                atomic_inc(&dentry->d_count);
+                dentry->d_count++;
                found = dentry;
                spin_unlock(&dentry->d_lock);
                break;
@@ -1473,8 +1963,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
         * routine may choose to leave the hash value unchanged.
         */
        name->hash = full_name_hash(name->name, name->len);
-        if (dir->d_op && dir->d_op->d_hash) {
+        if (dir->d_flags & DCACHE_OP_HASH) {
-                if (dir->d_op->d_hash(dir, name) < 0)
+                if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
                        goto out;
        }
        dentry = d_lookup(dir, name);
@@ -1483,34 +1973,32 @@ out:
 }
 /**
- * d_validate - verify dentry provided from insecure source
+ * d_validate - verify dentry provided from insecure source (deprecated)
 * @dentry: The dentry alleged to be valid child of @dparent
 * @dparent: The parent dentry (known to be valid)
 *
 * An insecure source has sent us a dentry, here we verify it and dget() it.
 * This is used by ncpfs in its readdir implementation.
 * Zero is returned in the dentry is invalid.
+ *
+ * This function is slow for big directories, and deprecated, do not use it.
 */
-int d_validate(struct dentry *dentry, struct dentry *parent)
+int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-        struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
+        struct dentry *child;
-        struct hlist_node *node;
-        struct dentry *d;
-        /* Check whether the ptr might be valid at all.. */
+        spin_lock(&dparent->d_lock);
-        if (!kmem_ptr_validate(dentry_cache, dentry))
+        list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
-                return 0;
+                if (dentry == child) {
-        if (dentry->d_parent != parent)
+                        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-                return 0;
+                        __dget_dlock(dentry);
+                        spin_unlock(&dentry->d_lock);
-        rcu_read_lock();
+                        spin_unlock(&dparent->d_lock);
-        hlist_for_each_entry_rcu(d, node, head, d_hash) {
-                if (d == dentry) {
-                        dget(dentry);
                        return 1;
                }
        }
-        rcu_read_unlock();
+        spin_unlock(&dparent->d_lock);
        return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1538,16 +2026,23 @@ EXPORT_SYMBOL(d_validate);
 
 void d_delete(struct dentry * dentry)
 {
+        struct inode *inode;
        int isdir = 0;
        /*
         * Are we the only user?
         */
-        spin_lock(&dcache_lock);
+again:
        spin_lock(&dentry->d_lock);
-        isdir = S_ISDIR(dentry->d_inode->i_mode);
+        inode = dentry->d_inode;
-        if (atomic_read(&dentry->d_count) == 1) {
+        isdir = S_ISDIR(inode->i_mode);
+        if (dentry->d_count == 1) {
+                if (inode && !spin_trylock(&inode->i_lock)) {
+                        spin_unlock(&dentry->d_lock);
+                        cpu_relax();
+                        goto again;
+                }
                dentry->d_flags &= ~DCACHE_CANT_MOUNT;
-                dentry_iput(dentry);
+                dentry_unlink_inode(dentry);
                fsnotify_nameremove(dentry, isdir);
                return;
        }
@@ -1556,17 +2051,18 @@ void d_delete(struct dentry * dentry)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        fsnotify_nameremove(dentry, isdir);
 }
 EXPORT_SYMBOL(d_delete);
-static void __d_rehash(struct dentry * entry, struct hlist_head *list)
+static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
 {
+        BUG_ON(!d_unhashed(entry));
+        spin_lock_bucket(b);
        entry->d_flags &= ~DCACHE_UNHASHED;
-        hlist_add_head_rcu(&entry->d_hash, list);
+        hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
+        spin_unlock_bucket(b);
 }
 static void _d_rehash(struct dentry * entry)
@@ -1583,25 +2079,39 @@ static void _d_rehash(struct dentry * entry)
 
 void d_rehash(struct dentry * entry)
 {
-        spin_lock(&dcache_lock);
        spin_lock(&entry->d_lock);
        _d_rehash(entry);
        spin_unlock(&entry->d_lock);
-        spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_rehash);
-/*
+/**
- * When switching names, the actual string doesn't strictly have to
+ * dentry_update_name_case - update case insensitive dentry with a new name
- * be preserved in the target - because we're dropping the target
+ * @dentry: dentry to be updated
- * anyway. As such, we can just do a simple memcpy() to copy over
+ * @name: new name
- * the new name before we switch.
 *
- * Note that we have to be a lot more careful about getting the hash
+ * Update a case insensitive dentry with new case of name.
- * switched - we have to switch the hash value properly even if it
+ *
- * then no longer matches the actual (corrupted) string of the target.
+ * dentry must have been returned by d_lookup with name @name. Old and new
- * The hash value has to match the hash queue that the dentry is on..
+ * name lengths must match (ie. no d_compare which allows mismatched name
+ * lengths).
+ *
+ * Parent inode i_mutex must be held over d_lookup and into this call (to
+ * keep renames and concurrent inserts, and readdir(2) away).
 */
+void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
+{
+        /*
+         * The contract is that the *parent* directory's i_mutex is held
+         * across d_lookup and this call, so assert on the parent's inode.
+         * Checking dentry->d_inode instead would test the wrong mutex (and
+         * would dereference NULL for a negative dentry).
+         */
+        BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+        BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
+        /*
+         * Overwrite the name bytes in place (same length, different case).
+         * d_lock serialises against other writers of d_name, and the d_seq
+         * write section lets seqcount readers of this dentry detect the
+         * update and retry.
+         */
+        spin_lock(&dentry->d_lock);
+        write_seqcount_begin(&dentry->d_seq);
+        memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
+        write_seqcount_end(&dentry->d_seq);
+        spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(dentry_update_name_case);
 static void switch_names(struct dentry *dentry, struct dentry *target)
 {
        if (dname_external(target)) {
@@ -1643,54 +2153,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
        swap(dentry->d_name.len, target->d_name.len);
 }
+/*
+ * Take every d_lock needed to move @dentry onto @target: first the parent
+ * lock(s), then the d_lock of both dentries themselves. Nothing is released
+ * here; the parent locks are dropped by dentry_unlock_parents_for_move() and
+ * the two dentry locks by the caller.
+ */
+static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
+{
+        /*
+         * XXXX: do we really need to take target->d_lock?
+         */
+        /*
+         * Only one parent lock is needed when @dentry is a root (its own
+         * parent) or when both dentries already share a parent.
+         */
+        if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
+                spin_lock(&target->d_parent->d_lock);
+        else {
+                /*
+                 * Two distinct parents: the order is chosen from the result
+                 * of d_ancestor() -- presumably non-NULL when dentry's parent
+                 * is an ancestor of target's parent (confirm against
+                 * d_ancestor()) -- so that parent is locked first and the
+                 * other is taken nested.
+                 */
+                if (d_ancestor(dentry->d_parent, target->d_parent)) {
+                        spin_lock(&dentry->d_parent->d_lock);
+                        spin_lock_nested(&target->d_parent->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
+                } else {
+                        spin_lock(&target->d_parent->d_lock);
+                        spin_lock_nested(&dentry->d_parent->d_lock,
+                                                DENTRY_D_LOCK_NESTED);
+                }
+        }
+        /*
+         * The two dentries themselves are locked in address order (lower
+         * address first), using nesting levels 2 and 3 so they are distinct
+         * from the levels used for the parent locks above.
+         */
+        if (target < dentry) {
+                spin_lock_nested(&target->d_lock, 2);
+                spin_lock_nested(&dentry->d_lock, 3);
+        } else {
+                spin_lock_nested(&dentry->d_lock, 2);
+                spin_lock_nested(&target->d_lock, 3);
+        }
+}
+/*
+ * Release the parent d_lock(s) of @dentry and @target; the d_lock of the two
+ * dentries themselves is left held for the caller to drop.
+ *
+ * NOTE(review): the conditions are evaluated against the d_parent values at
+ * call time, and the visible callers invoke this after re-parenting --
+ * confirm that this always pairs with what dentry_lock_for_move() acquired.
+ */
+static void dentry_unlock_parents_for_move(struct dentry *dentry,
+                                        struct dentry *target)
+{
+        /* Shared parent: it is unlocked once, through target below. */
+        if (target->d_parent != dentry->d_parent)
+                spin_unlock(&dentry->d_parent->d_lock);
+        /* A target that is its own parent has no separate parent lock. */
+        if (target->d_parent != target)
+                spin_unlock(&target->d_parent->d_lock);
+}
 /*
- * We cannibalize "target" when moving dentry on top of it,
+ * When switching names, the actual string doesn't strictly have to
- * because it's going to be thrown away anyway. We could be more
+ * be preserved in the target - because we're dropping the target
- * polite about it, though.
+ * anyway. As such, we can just do a simple memcpy() to copy over
- *
+ * the new name before we switch.
- * This forceful removal will result in ugly /proc output if
+ *
- * somebody holds a file open that got deleted due to a rename.
+ * Note that we have to be a lot more careful about getting the hash
- * We could be nicer about the deleted file, and let it show
+ * switched - we have to switch the hash value properly even if it
- * up under the name it had before it was deleted rather than
+ * then no longer matches the actual (corrupted) string of the target.
- * under the original name of the file that was moved on top of it.
+ * The hash value has to match the hash queue that the dentry is on..
 */
- 
 /*
- * d_move_locked - move a dentry
+ * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way.
 */
-static void d_move_locked(struct dentry * dentry, struct dentry * target)
+void d_move(struct dentry * dentry, struct dentry * target)
 {
-        struct hlist_head *list;
        if (!dentry->d_inode)
                printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+        BUG_ON(d_ancestor(dentry, target));
+        BUG_ON(d_ancestor(target, dentry));
        write_seqlock(&rename_lock);
-        /*
-         * XXXX: do we really need to take target->d_lock?
-         */
-        if (target < dentry) {
-                spin_lock(&target->d_lock);
-                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-        } else {
-                spin_lock(&dentry->d_lock);
-                spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
-        }
-        /* Move the dentry to the target hash queue, if on different bucket */
+        dentry_lock_for_move(dentry, target);
-        if (d_unhashed(dentry))
-                goto already_unhashed;
+        write_seqcount_begin(&dentry->d_seq);
+        write_seqcount_begin(&target->d_seq);
-        hlist_del_rcu(&dentry->d_hash);
+        /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
-already_unhashed:
+        /*
-        list = d_hash(target->d_parent, target->d_name.hash);
+         * Move the dentry to the target hash queue. Don't bother checking
-        __d_rehash(dentry, list);
+         * for the same hash queue because of how unlikely it is.
+         */
+        __d_drop(dentry);
+        __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
        /* Unhash the target: dput() will then get rid of it */
        __d_drop(target);
@@ -1715,27 +2255,16 @@ already_unhashed:
        }
        list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+        write_seqcount_end(&target->d_seq);
+        write_seqcount_end(&dentry->d_seq);
+        dentry_unlock_parents_for_move(dentry, target);
        spin_unlock(&target->d_lock);
        fsnotify_d_move(dentry);
        spin_unlock(&dentry->d_lock);
        write_sequnlock(&rename_lock);
 }
-/**
- * d_move - move a dentry
- * @dentry: entry to move
- * @target: new dentry
- *
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way.
- */
-void d_move(struct dentry * dentry, struct dentry * target)
-{
-        spin_lock(&dcache_lock);
-        d_move_locked(dentry, target);
-        spin_unlock(&dcache_lock);
-}
 EXPORT_SYMBOL(d_move);
 /**
@@ -1761,13 +2290,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex and the dcache_lock
+ * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 */
-static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+static struct dentry *__d_unalias(struct inode *inode,
-        __releases(dcache_lock)
+                struct dentry *dentry, struct dentry *alias)
 {
        struct mutex *m1 = NULL, *m2 = NULL;
        struct dentry *ret;
@@ -1790,10 +2319,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
                goto out_err;
        m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
-        d_move_locked(alias, dentry);
+        d_move(alias, dentry);
        ret = alias;
 out_err:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        if (m2)
                mutex_unlock(m2);
        if (m1)
@@ -1804,17 +2333,23 @@ out_err:
 /*
 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
 * named dentry in place of the dentry to be replaced.
+ * returns with anon->d_lock held!
 */
 static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 {
        struct dentry *dparent, *aparent;
-        switch_names(dentry, anon);
+        dentry_lock_for_move(anon, dentry);
-        swap(dentry->d_name.hash, anon->d_name.hash);
+        write_seqcount_begin(&dentry->d_seq);
+        write_seqcount_begin(&anon->d_seq);
        dparent = dentry->d_parent;
        aparent = anon->d_parent;
+        switch_names(dentry, anon);
+        swap(dentry->d_name.hash, anon->d_name.hash);
        dentry->d_parent = (aparent == anon) ? dentry : aparent;
        list_del(&dentry->d_u.d_child);
        if (!IS_ROOT(dentry))
@@ -1829,6 +2364,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
        else
                INIT_LIST_HEAD(&anon->d_u.d_child);
+        write_seqcount_end(&dentry->d_seq);
+        write_seqcount_end(&anon->d_seq);
+        dentry_unlock_parents_for_move(anon, dentry);
+        spin_unlock(&dentry->d_lock);
+        /* anon->d_lock still locked, returns locked */
        anon->d_flags &= ~DCACHE_DISCONNECTED;
 }
@@ -1846,14 +2388,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
        BUG_ON(!d_unhashed(dentry));
-        spin_lock(&dcache_lock);
        if (!inode) {
                actual = dentry;
                __d_instantiate(dentry, NULL);
-                goto found_lock;
+                d_rehash(actual);
+                goto out_nolock;
        }
+        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *alias;
@@ -1864,13 +2407,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                        /* Is this an anonymous mountpoint that we could splice
                         * into our tree? */
                        if (IS_ROOT(alias)) {
-                                spin_lock(&alias->d_lock);
                                __d_materialise_dentry(dentry, alias);
                                __d_drop(alias);
                                goto found;
                        }
                        /* Nope, but we must(!) avoid directory aliasing */
-                        actual = __d_unalias(dentry, alias);
+                        actual = __d_unalias(inode, dentry, alias);
                        if (IS_ERR(actual))
                                dput(alias);
                        goto out_nolock;
@@ -1881,15 +2423,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
        actual = __d_instantiate_unique(dentry, inode);
        if (!actual)
                actual = dentry;
-        else if (unlikely(!d_unhashed(actual)))
+        else
-                goto shouldnt_be_hashed;
+                BUG_ON(!d_unhashed(actual));
-found_lock:
        spin_lock(&actual->d_lock);
 found:
        _d_rehash(actual);
        spin_unlock(&actual->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 out_nolock:
        if (actual == dentry) {
                security_d_instantiate(dentry, inode);
@@ -1898,10 +2439,6 @@ out_nolock:
        iput(inode);
        return actual;
-shouldnt_be_hashed:
-        spin_unlock(&dcache_lock);
-        BUG();
 }
 EXPORT_SYMBOL_GPL(d_materialise_unique);
@@ -1921,14 +2458,13 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 }
 /**
- * Prepend path string to a buffer
+ * prepend_path - Prepend path string to a buffer
- *
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry (may be modified by this function)
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
- * Caller holds the dcache_lock.
+ * Caller holds the rename_lock.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -1956,7 +2492,9 @@ static int prepend_path(const struct path *path, struct path *root,
                }
                parent = dentry->d_parent;
                prefetch(parent);
+                spin_lock(&dentry->d_lock);
                error = prepend_name(buffer, buflen, &dentry->d_name);
+                spin_unlock(&dentry->d_lock);
                if (!error)
                        error = prepend(buffer, buflen, "/", 1);
                if (error)
@@ -2012,9 +2550,9 @@ char *__d_path(const struct path *path, struct path *root,
        int error;
        prepend(&res, &buflen, "\0", 1);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        error = prepend_path(path, root, &res, &buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        if (error)
                return ERR_PTR(error);
@@ -2076,12 +2614,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
        get_fs_root(current->fs, &root);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (error)
                res = ERR_PTR(error);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        path_put(&root);
        return res;
 }
@@ -2107,12 +2645,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
        get_fs_root(current->fs, &root);
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (!error && !path_equal(&tmp, &root))
                error = prepend_unreachable(&res, &buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        path_put(&root);
        if (error)
                res =  ERR_PTR(error);
@@ -2144,7 +2682,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 /*
 * Write full pathname from the root of the filesystem into the buffer.
 */
-char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
+static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
        char *end = buf + buflen;
        char *retval;
@@ -2158,10 +2696,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
        while (!IS_ROOT(dentry)) {
                struct dentry *parent = dentry->d_parent;
+                int error;
                prefetch(parent);
-                if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+                spin_lock(&dentry->d_lock);
-                    (prepend(&end, &buflen, "/", 1) != 0))
+                error = prepend_name(&end, &buflen, &dentry->d_name);
+                spin_unlock(&dentry->d_lock);
+                if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
                        goto Elong;
                retval = end;
@@ -2171,14 +2712,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
-EXPORT_SYMBOL(__dentry_path);
+char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+{
+        char *retval;
+        write_seqlock(&rename_lock);
+        retval = __dentry_path(dentry, buf, buflen);
+        write_sequnlock(&rename_lock);
+        return retval;
+}
+EXPORT_SYMBOL(dentry_path_raw);
 char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
        char *p = NULL;
        char *retval;
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        if (d_unlinked(dentry)) {
                p = buf + buflen;
                if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2186,12 +2738,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
                buflen++;
        }
        retval = __dentry_path(dentry, buf, buflen);
-        spin_unlock(&dcache_lock);
+        write_sequnlock(&rename_lock);
        if (!IS_ERR(retval) && p)
                *p = '/';       /* restore '/' overriden with '\0' */
        return retval;
 Elong:
-        spin_unlock(&dcache_lock);
        return ERR_PTR(-ENAMETOOLONG);
 }
@@ -2225,7 +2776,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
        get_fs_root_and_pwd(current->fs, &root, &pwd);
        error = -ENOENT;
-        spin_lock(&dcache_lock);
+        write_seqlock(&rename_lock);
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                struct path tmp = root;
@@ -2234,7 +2785,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                prepend(&cwd, &buflen, "\0", 1);
                error = prepend_path(&pwd, &tmp, &cwd, &buflen);
-                spin_unlock(&dcache_lock);
+                write_sequnlock(&rename_lock);
                if (error)
                        goto out;
@@ -2253,8 +2804,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                        if (copy_to_user(buf, cwd, len))
                                error = -EFAULT;
                }
-        } else
+        } else {
-                spin_unlock(&dcache_lock);
+                write_sequnlock(&rename_lock);
+        }
 out:
        path_put(&pwd);
@@ -2282,25 +2834,25 @@ out:
 int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
        int result;
-        unsigned long seq;
+        unsigned seq;
        if (new_dentry == old_dentry)
                return 1;
-        /*
-         * Need rcu_readlock to protect against the d_parent trashing
-         * due to d_move
-         */
-        rcu_read_lock();
        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
+                /*
+                 * Need rcu_readlock to protect against the d_parent trashing
+                 * due to d_move
+                 */
+                rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = 1;
                else
                        result = 0;
+                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));
-        rcu_read_unlock();
        return result;
 }
@@ -2332,10 +2884,15 @@ EXPORT_SYMBOL(path_is_under);
 void d_genocide(struct dentry *root)
 {
-        struct dentry *this_parent = root;
+        struct dentry *this_parent;
        struct list_head *next;
+        unsigned seq;
+        int locked = 0;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+again:
+        this_parent = root;
+        spin_lock(&this_parent->d_lock);
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -2343,21 +2900,62 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-                if (d_unhashed(dentry)||!dentry->d_inode)
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+                if (d_unhashed(dentry) || !dentry->d_inode) {
+                        spin_unlock(&dentry->d_lock);
                        continue;
+                }
                if (!list_empty(&dentry->d_subdirs)) {
+                        spin_unlock(&this_parent->d_lock);
+                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                        this_parent = dentry;
+                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
-                atomic_dec(&dentry->d_count);
+                if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+                        dentry->d_flags |= DCACHE_GENOCIDE;
+                        dentry->d_count--;
+                }
+                spin_unlock(&dentry->d_lock);
        }
        if (this_parent != root) {
-                next = this_parent->d_u.d_child.next;
+                struct dentry *tmp;
-                atomic_dec(&this_parent->d_count);
+                struct dentry *child;
-                this_parent = this_parent->d_parent;
+                tmp = this_parent->d_parent;
+                if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
+                        this_parent->d_flags |= DCACHE_GENOCIDE;
+                        this_parent->d_count--;
+                }
+                rcu_read_lock();
+                spin_unlock(&this_parent->d_lock);
+                child = this_parent;
+                this_parent = tmp;
+                spin_lock(&this_parent->d_lock);
+                /* might go back up the wrong parent if we have had a rename
+                 * or deletion */
+                if (this_parent != child->d_parent ||
+                         (!locked && read_seqretry(&rename_lock, seq))) {
+                        spin_unlock(&this_parent->d_lock);
+                        rcu_read_unlock();
+                        goto rename_retry;
+                }
+                rcu_read_unlock();
+                next = child->d_u.d_child.next;
                goto resume;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&this_parent->d_lock);
+        if (!locked && read_seqretry(&rename_lock, seq))
+                goto rename_retry;
+        if (locked)
+                write_sequnlock(&rename_lock);
+        return;
+rename_retry:
+        locked = 1;
+        write_seqlock(&rename_lock);
+        goto again;
 }
 /**
@@ -2411,7 +3009,7 @@ static void __init dcache_init_early(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct hlist_head),
+                                        sizeof(struct dcache_hash_bucket),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY,
@@ -2420,16 +3018,13 @@ static void __init dcache_init_early(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 static void __init dcache_init(void)
 {
        int loop;
-        percpu_counter_init(&nr_dentry, 0);
-        percpu_counter_init(&nr_dentry_unused, 0);
        /* 
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
@@ -2446,7 +3041,7 @@ static void __init dcache_init(void)
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
-                                        sizeof(struct hlist_head),
+                                        sizeof(struct dcache_hash_bucket),
                                        dhash_entries,
                                        13,
                                        0,
@@ -2455,7 +3050,7 @@ static void __init dcache_init(void)
                                        0);
        for (loop = 0; loop < (1 << d_hash_shift); loop++)
-                INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+                INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 /* SLAB cache for __getname() consumers */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f..b044705eedd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
-static int
+static void
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                sector_t first_sector, int nr_vecs)
 {
        struct bio *bio;
+        /*
+         * bio_alloc() is guaranteed to return a bio when called with
+         * __GFP_WAIT and we request a valid number of vectors.
+         */
        bio = bio_alloc(GFP_KERNEL, nr_vecs);
        bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
        dio->bio = bio;
        dio->logical_offset_in_bio = dio->cur_page_fs_offset;
-        return 0;
 }
 /*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
                goto out;
        sector = start_sector << (dio->blkbits - 9);
        nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+        nr_pages = min(nr_pages, BIO_MAX_PAGES);
        BUG_ON(nr_pages <= 0);
-        ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+        dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
        dio->boundary = 0;
 out:
        return ret;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e811..1897eb1b4b6 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
 menuconfig DLM
        tristate "Distributed Lock Manager (DLM)"
        depends on EXPERIMENTAL && INET
-        depends on SYSFS && (IPV6 || IPV6=n)
+        depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
-        select CONFIGFS_FS
        select IP_SCTP
        help
        A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622..9c64ae9e4c1 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
 #define NEEDED_RMEM (4*1024*1024)
 #define CONN_HASH_SIZE 32
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
 struct cbuf {
        unsigned int base;
        unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
        struct list_head writequeue;  /* List of outgoing writequeue_entries */
        spinlock_t writequeue_lock;
        int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
        struct connection *con = sock2con(sk);
-        if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+        if (!con)
+                return;
+        clear_bit(SOCK_NOSPACE, &con->sock->flags);
+        if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+                con->sock->sk->sk_write_pending--;
+                clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+        }
+        if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
                queue_work(send_workqueue, &con->swork);
 }
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
        struct sockaddr_storage saddr, src_addr;
        int addr_len;
        struct socket *sock = NULL;
+        int one = 1;
        if (con->nodeid == 0) {
                log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
        make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
        log_print("connecting to %d", con->nodeid);
+        /* Turn off Nagle's algorithm */
+        kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+                          sizeof(one));
        result =
                sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
                                   O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
                goto create_out;
        }
+        /* Turn off Nagle's algorithm */
+        kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+                          sizeof(one));
        result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                                   (char *)&one, sizeof(one));
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
        const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
        struct writequeue_entry *e;
        int len, offset;
+        int count = 0;
        mutex_lock(&con->sock_mutex);
        if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
                        ret = kernel_sendpage(con->sock, e->page, offset, len,
                                              msg_flags);
                        if (ret == -EAGAIN || ret == 0) {
+                                if (ret == -EAGAIN &&
+                                    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+                                    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+                                        /* Notify TCP that we're limited by the
+                                         * application window size.
+                                         */
+                                        set_bit(SOCK_NOSPACE, &con->sock->flags);
+                                        con->sock->sk->sk_write_pending++;
+                                }
                                cond_resched();
                                goto out;
                        }
                        if (ret <= 0)
                                goto send_error;
                }
-                        /* Don't starve people filling buffers */
+                /* Don't starve people filling buffers */
+                if (++count >= MAX_SEND_MSG_COUNT) {
                        cond_resched();
+                        count = 0;
+                }
                spin_lock(&con->writequeue_lock);
                e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
 static int work_start(void)
 {
-        int error;
+        recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
-        recv_workqueue = create_workqueue("dlm_recv");
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-        error = IS_ERR(recv_workqueue);
+        if (!recv_workqueue) {
-        if (error) {
+                log_print("can't start dlm_recv");
-                log_print("can't start dlm_recv %d", error);
+                return -ENOMEM;
-                return error;
        }
-        send_workqueue = create_singlethread_workqueue("dlm_send");
+        send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
-        error = IS_ERR(send_workqueue);
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-        if (error) {
+        if (!send_workqueue) {
-                log_print("can't start dlm_send %d", error);
+                log_print("can't start dlm_send");
                destroy_workqueue(recv_workqueue);
-                return error;
+                return -ENOMEM;
        }
        return 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e..bfd8b680e64 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
        BUG_ON(!crypt_stat || !crypt_stat->tfm
               || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
        if (unlikely(ecryptfs_verbosity > 0)) {
-                ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+                ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
                                crypt_stat->key_size);
                ecryptfs_dump_hex(crypt_stat->key,
                                  crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
        rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
                                (extent_base + extent_offset));
        if (rc) {
-                ecryptfs_printk(KERN_ERR, "Error attempting to "
+                ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
-                                "derive IV for extent [0x%.16x]; "
+                        "extent [0x%.16llx]; rc = [%d]\n",
-                                "rc = [%d]\n", (extent_base + extent_offset),
+                        (unsigned long long)(extent_base + extent_offset), rc);
-                                rc);
                goto out;
        }
        if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
        }
        rc = 0;
        if (unlikely(ecryptfs_verbosity > 0)) {
-                ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
+                ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
-                                "rc = [%d]\n", (extent_base + extent_offset),
+                        "rc = [%d]\n",
-                                rc);
+                        (unsigned long long)(extent_base + extent_offset), rc);
                ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
                                "encryption:\n");
                ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
        rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
                                (extent_base + extent_offset));
        if (rc) {
-                ecryptfs_printk(KERN_ERR, "Error attempting to "
+                ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
-                                "derive IV for extent [0x%.16x]; "
+                        "extent [0x%.16llx]; rc = [%d]\n",
-                                "rc = [%d]\n", (extent_base + extent_offset),
+                        (unsigned long long)(extent_base + extent_offset), rc);
-                                rc);
                goto out;
        }
        if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
        }
        rc = 0;
        if (unlikely(ecryptfs_verbosity > 0)) {
-                ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
+                ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
-                                "rc = [%d]\n", (extent_base + extent_offset),
+                        "rc = [%d]\n",
-                                rc);
+                        (unsigned long long)(extent_base + extent_offset), rc);
                ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
                                "decryption:\n");
                ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
        }
        ecryptfs_printk(KERN_DEBUG,
                        "Initializing cipher [%s]; strlen = [%d]; "
-                        "key_size_bits = [%d]\n",
+                        "key_size_bits = [%zd]\n",
                        crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
                        crypt_stat->key_size << 3);
        if (crypt_stat->tfm) {
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f7..6fc4f319b55 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
 */
 static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+        struct dentry *lower_dentry;
-        struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+        struct vfsmount *lower_mnt;
        struct dentry *dentry_save;
        struct vfsmount *vfsmount_save;
        int rc = 1;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        lower_dentry = ecryptfs_dentry_to_lower(dentry);
+        lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
        if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
                goto out;
        dentry_save = nd->path.dentry;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0b..dbc84ed9633 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
                (((struct user_key_payload*)key->payload.data)->data);
 }
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 #define ecryptfs_printk(type, fmt, arg...) \
        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
 void __ecryptfs_printk(const char *fmt, ...);
 extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bf..81e10e6a944 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
                                const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos)
 {
-        int rc;
+        ssize_t rc;
        struct dentry *lower_dentry;
        struct vfsmount *lower_vfsmount;
        struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                                      | ECRYPTFS_ENCRYPTED);
        }
        mutex_unlock(&crypt_stat->cs_mutex);
-        if (!ecryptfs_inode_to_private(inode)->lower_file) {
+        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-                rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        if (rc) {
-                if (rc) {
+                printk(KERN_ERR "%s: Error attempting to initialize "
-                        printk(KERN_ERR "%s: Error attempting to initialize "
+                        "the persistent file for the dentry with name "
-                               "the persistent file for the dentry with name "
+                        "[%s]; rc = [%d]\n", __func__,
-                               "[%s]; rc = [%d]\n", __func__,
+                        ecryptfs_dentry->d_name.name, rc);
-                               ecryptfs_dentry->d_name.name, rc);
+                goto out_free;
-                        goto out_free;
-                }
        }
-        if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
+        if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
-            && !(file->f_flags & O_RDONLY)) {
+            == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
                rc = -EPERM;
                printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
                       "file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                }
        }
        mutex_unlock(&crypt_stat->cs_mutex);
-        ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
+        ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
-                        "size: [0x%.16x]\n", inode, inode->i_ino,
+                        "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
-                        i_size_read(inode));
+                        (unsigned long long)i_size_read(inode));
        goto out;
 out_free:
        kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d1a22d6276..bd33f87a190 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
                                "context; rc = [%d]\n", rc);
                goto out;
        }
-        if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-                rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        if (rc) {
-                if (rc) {
+                printk(KERN_ERR "%s: Error attempting to initialize "
-                        printk(KERN_ERR "%s: Error attempting to initialize "
+                        "the persistent file for the dentry with name "
-                               "the persistent file for the dentry with name "
+                        "[%s]; rc = [%d]\n", __func__,
-                               "[%s]; rc = [%d]\n", __func__,
+                        ecryptfs_dentry->d_name.name, rc);
-                               ecryptfs_dentry->d_name.name, rc);
+                goto out;
-                        goto out;
-                }
        }
        rc = ecryptfs_write_metadata(ecryptfs_dentry);
        if (rc) {
@@ -260,7 +258,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                                   ecryptfs_dentry->d_parent));
        lower_inode = lower_dentry->d_inode;
        fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
-        BUG_ON(!atomic_read(&lower_dentry->d_count));
+        BUG_ON(!lower_dentry->d_count);
        ecryptfs_set_dentry_private(ecryptfs_dentry,
                                    kmem_cache_alloc(ecryptfs_dentry_info_cache,
                                                     GFP_KERNEL));
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                rc = -ENOMEM;
                goto out;
        }
-        if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+        rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-                rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+        if (rc) {
-                if (rc) {
+                printk(KERN_ERR "%s: Error attempting to initialize "
-                        printk(KERN_ERR "%s: Error attempting to initialize "
+                        "the persistent file for the dentry with name "
-                               "the persistent file for the dentry with name "
+                        "[%s]; rc = [%d]\n", __func__,
-                               "[%s]; rc = [%d]\n", __func__,
+                        ecryptfs_dentry->d_name.name, rc);
-                               ecryptfs_dentry->d_name.name, rc);
+                goto out_free_kmem;
-                        goto out_free_kmem;
-                }
        }
        crypt_stat = &ecryptfs_inode_to_private(
                                        ecryptfs_dentry->d_inode)->crypt_stat;
@@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        struct qstr lower_name;
        int rc = 0;
-        ecryptfs_dentry->d_op = &ecryptfs_dops;
        if ((ecryptfs_dentry->d_name.len == 1
             && !strcmp(ecryptfs_dentry->d_name.name, "."))
            || (ecryptfs_dentry->d_name.len == 2
@@ -454,7 +449,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        lower_name.hash = ecryptfs_dentry->d_name.hash;
        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-                                                    &lower_name);
+                                lower_dir_dentry->d_inode, &lower_name);
                if (rc < 0)
                        goto out_d_drop;
        }
@@ -489,7 +484,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-                                                    &lower_name);
+                                lower_dir_dentry->d_inode, &lower_name);
                if (rc < 0)
                        goto out_d_drop;
        }
@@ -980,8 +975,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 static int
-ecryptfs_permission(struct inode *inode, int mask)
+ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        return inode_permission(ecryptfs_inode_to_lower(inode), mask);
 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a522..c1436cff6f2 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
                break;
        default:
                ecryptfs_printk(KERN_WARNING, "Unknown error code: "
-                                "[0x%.16x]\n", err_code);
+                                "[0x%.16lx]\n", err_code);
                rc = -EINVAL;
        }
        return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
        } else {
                rc = -EINVAL;
                ecryptfs_printk(KERN_WARNING,
-                                "Unsupported packet size: [%d]\n", size);
+                                "Unsupported packet size: [%zd]\n", size);
        }
        return rc;
 }
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
               auth_tok->session_key.decrypted_key_size);
        crypt_stat->flags |= ECRYPTFS_KEY_VALID;
        if (unlikely(ecryptfs_verbosity > 0)) {
-                ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+                ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
                                crypt_stat->key_size);
                ecryptfs_dump_hex(crypt_stat->key,
                                  crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
                        if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
                                ecryptfs_printk(KERN_ERR, "Expected "
                                                "signature of size [%d]; "
-                                                "read size [%d]\n",
+                                                "read size [%zd]\n",
                                                ECRYPTFS_SIG_SIZE,
                                                tag_11_contents_size);
                                rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
                        goto out_wipe_list;
                        break;
                default:
-                        ecryptfs_printk(KERN_DEBUG, "No packet at offset "
+                        ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
-                                        "[%d] of the file header; hex value of "
+                                        "of the file header; hex value of "
                                        "character is [0x%.2x]\n", i, src[i]);
                        next_packet_is_auth_tok_packet = 0;
                }
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
                                "session key for authentication token with sig "
                                "[%.*s]; rc = [%d]. Removing auth tok "
                                "candidate from the list and searching for "
-                                "the next match.\n", candidate_auth_tok_sig,
+                                "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
-                                ECRYPTFS_SIG_SIZE_HEX, rc);
+                                candidate_auth_tok_sig, rc);
                list_for_each_entry_safe(auth_tok_list_item,
                                         auth_tok_list_item_tmp,
                                         &auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
        if (encrypted_session_key_valid) {
                ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
                                "using auth_tok->session_key.encrypted_key, "
-                                "where key_rec->enc_key_size = [%d]\n",
+                                "where key_rec->enc_key_size = [%zd]\n",
                                key_rec->enc_key_size);
                memcpy(key_rec->enc_key,
                       auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
        if (rc < 1 || rc > 2) {
                ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
                                "for crypt_stat session key; expected rc = 1; "
-                                "got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+                                "got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
                                rc, key_rec->enc_key_size);
                rc = -ENOMEM;
                goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
                ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
                                "for crypt_stat encrypted session key; "
                                "expected rc = 1; got rc = [%d]. "
-                                "key_rec->enc_key_size = [%d]\n", rc,
+                                "key_rec->enc_key_size = [%zd]\n", rc,
                                key_rec->enc_key_size);
                rc = -ENOMEM;
                goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
                goto out;
        }
        rc = 0;
-        ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+        ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
                        crypt_stat->key_size);
        rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
                                      (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
        }
        ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
        if (ecryptfs_verbosity > 0) {
-                ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+                ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
                                key_rec->enc_key_size);
                ecryptfs_dump_hex(key_rec->enc_key,
                                  key_rec->enc_key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a9dbd62518e..758323a0f09 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
        return rc;
 }
-/**
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
- * ecryptfs_interpose
+                       struct super_block *sb)
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-                       struct super_block *sb, u32 flags)
 {
-        struct inode *lower_inode;
        struct inode *inode;
        int rc = 0;
-        lower_inode = lower_dentry->d_inode;
        if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
                rc = -EXDEV;
                goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
        if (special_file(lower_inode->i_mode))
                init_special_inode(inode, lower_inode->i_mode,
                                   lower_inode->i_rdev);
-        dentry->d_op = &ecryptfs_dops;
        fsstack_copy_attr_all(inode, lower_inode);
        /* This size will be overwritten for real files w/ headers and
         * other metadata */
        fsstack_copy_inode_size(inode, lower_inode);
+        return inode;
+out:
+        return ERR_PTR(rc);
+}
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flags: flags to govern behavior of interpose procedure
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+                       struct super_block *sb, u32 flags)
+{
+        struct inode *lower_inode = lower_dentry->d_inode;
+        struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
        if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
                d_add(dentry, inode);
        else
                d_instantiate(dentry, inode);
-out:
+        return 0;
-        return rc;
 }
 enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
@@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache;
 static struct file_system_type ecryptfs_fs_type;
 /**
- * ecryptfs_read_super
- * @sb: The ecryptfs super block
- * @dev_name: The path to mount over
- *
- * Read the super block of the lower filesystem, and use
- * ecryptfs_interpose to create our initial inode and super block
- * struct.
- */
-static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
-{
-        struct path path;
-        int rc;
-        rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-        if (rc) {
-                ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
-                goto out;
-        }
-        if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
-                rc = -EINVAL;
-                printk(KERN_ERR "Mount on filesystem of type "
-                        "eCryptfs explicitly disallowed due to "
-                        "known incompatibilities\n");
-                goto out_free;
-        }
-        ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
-        sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
-        sb->s_blocksize = path.dentry->d_sb->s_blocksize;
-        ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
-        ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
-        rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
-        if (rc)
-                goto out_free;
-        rc = 0;
-        goto out;
-out_free:
-        path_put(&path);
-out:
-        return rc;
-}
-/**
 * ecryptfs_get_sb
 * @fs_type
 * @flags
 * @dev_name: The path to mount over
 * @raw_data: The options passed into the kernel
- *
- * The whole ecryptfs_get_sb process is broken into 3 functions:
- * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_read_super(): this accesses the lower filesystem and uses
- *                        ecryptfs_interpose to perform most of the linking
- * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
 static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
                        const char *dev_name, void *raw_data)
@@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
        struct ecryptfs_sb_info *sbi;
        struct ecryptfs_dentry_info *root_info;
        const char *err = "Getting sb failed";
+        struct inode *inode;
+        struct path path;
        int rc;
        sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
        s->s_flags = flags;
        rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
-        if (rc) {
+        if (rc)
-                deactivate_locked_super(s);
+                goto out1;
-                goto out;
-        }
        ecryptfs_set_superblock_private(s, sbi);
        s->s_bdi = &sbi->bdi;
@@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
        /* ->kill_sb() will take care of sbi after that point */
        sbi = NULL;
        s->s_op = &ecryptfs_sops;
+        s->s_d_op = &ecryptfs_dops;
-        rc = -ENOMEM;
+        err = "Reading sb failed";
-        s->s_root = d_alloc(NULL, &(const struct qstr) {
+        rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-                             .hash = 0,.name = "/",.len = 1});
+        if (rc) {
+                ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
+                goto out1;
+        }
+        if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+                rc = -EINVAL;
+                printk(KERN_ERR "Mount on filesystem of type "
+                        "eCryptfs explicitly disallowed due to "
+                        "known incompatibilities\n");
+                goto out_free;
+        }
+        ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+        s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+        s->s_blocksize = path.dentry->d_sb->s_blocksize;
+        s->s_magic = ECRYPTFS_SUPER_MAGIC;
+        inode = ecryptfs_get_inode(path.dentry->d_inode, s);
+        rc = PTR_ERR(inode);
+        if (IS_ERR(inode))
+                goto out_free;
+        s->s_root = d_alloc_root(inode);
        if (!s->s_root) {
-                deactivate_locked_super(s);
+                iput(inode);
-                goto out;
+                rc = -ENOMEM;
+                goto out_free;
        }
-        s->s_root->d_op = &ecryptfs_dops;
-        s->s_root->d_sb = s;
-        s->s_root->d_parent = s->s_root;
+        rc = -ENOMEM;
        root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
-        if (!root_info) {
+        if (!root_info)
-                deactivate_locked_super(s);
+                goto out_free;
-                goto out;
-        }
        /* ->kill_sb() will take care of root_info */
        ecryptfs_set_dentry_private(s->s_root, root_info);
+        ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+        ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
        s->s_flags |= MS_ACTIVE;
-        rc = ecryptfs_read_super(s, dev_name);
-        if (rc) {
-                deactivate_locked_super(s);
-                err = "Reading sb failed";
-                goto out;
-        }
        return dget(s->s_root);
+out_free:
+        path_put(&path);
+out1:
+        deactivate_locked_super(s);
 out:
        if (sbi) {
                ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
@@ -828,9 +810,10 @@ static int __init ecryptfs_init(void)
                ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
                                "larger than the host's page size, and so "
                                "eCryptfs cannot run on this system. The "
-                                "default eCryptfs extent size is [%d] bytes; "
+                                "default eCryptfs extent size is [%u] bytes; "
-                                "the page size is [%d] bytes.\n",
+                                "the page size is [%lu] bytes.\n",
-                                ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+                                ECRYPTFS_DEFAULT_EXTENT_SIZE,
+                                (unsigned long)PAGE_CACHE_SIZE);
                goto out;
        }
        rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544..cc64fca89f8 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
        rc = ecryptfs_encrypt_page(page);
        if (rc) {
                ecryptfs_printk(KERN_WARNING, "Error encrypting "
-                                "page (upper index [0x%.16x])\n", page->index);
+                                "page (upper index [0x%.16lx])\n", page->index);
                ClearPageUptodate(page);
                goto out;
        }
@@ -237,7 +237,7 @@ out:
                ClearPageUptodate(page);
        else
                SetPageUptodate(page);
-        ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+        ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
                        page->index);
        unlock_page(page);
        return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
                return -ENOMEM;
        *pagep = page;
+        prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
        if (!PageUptodate(page)) {
                struct ecryptfs_crypt_stat *crypt_stat =
                        &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
                                SetPageUptodate(page);
                        }
                } else {
-                        rc = ecryptfs_decrypt_page(page);
+                        if (prev_page_end_size
-                        if (rc) {
+                            >= i_size_read(page->mapping->host)) {
-                                printk(KERN_ERR "%s: Error decrypting page "
+                                zero_user(page, 0, PAGE_CACHE_SIZE);
-                                       "at index [%ld]; rc = [%d]\n",
+                        } else {
-                                       __func__, page->index, rc);
+                                rc = ecryptfs_decrypt_page(page);
-                                ClearPageUptodate(page);
+                                if (rc) {
-                                goto out;
+                                        printk(KERN_ERR "%s: Error decrypting "
+                                               "page at index [%ld]; "
+                                               "rc = [%d]\n",
+                                               __func__, page->index, rc);
+                                        ClearPageUptodate(page);
+                                        goto out;
+                                }
                        }
                        SetPageUptodate(page);
                }
        }
-        prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
        /* If creating a page or more of holes, zero them out via truncate.
         * Note, this will increase i_size. */
        if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
        } else
                ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
        ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-                        "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+                        "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
        if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
                rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
                                                       to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
        rc = fill_zeros_to_end_of_page(page, to);
        if (rc) {
                ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-                        "zeros in page with index = [0x%.16x]\n", index);
+                        "zeros in page with index = [0x%.16lx]\n", index);
                goto out;
        }
        rc = ecryptfs_encrypt_page(page);
        if (rc) {
                ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-                                "index [0x%.16x])\n", index);
+                                "index [0x%.16lx])\n", index);
                goto out;
        }
        if (pos + copied > i_size_read(ecryptfs_inode)) {
                i_size_write(ecryptfs_inode, pos + copied);
                ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
-                                "[0x%.16x]\n", i_size_read(ecryptfs_inode));
+                        "[0x%.16llx]\n",
+                        (unsigned long long)i_size_read(ecryptfs_inode));
        }
        rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
        if (rc)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d3..3042fe123a3 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/key.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -63,6 +62,16 @@ out:
        return inode;
 }
+static void ecryptfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ecryptfs_inode_info *inode_info;
+        inode_info = ecryptfs_inode_to_private(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
 /**
 * ecryptfs_destroy_inode
 * @inode: The ecryptfs inode
@@ -89,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
                }
        }
        ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
-        kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+        call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
 /**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5073a07652c..0f31acb0131 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void efs_destroy_inode(struct inode *inode)
+static void efs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
+static void efs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, efs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067..cc8a9b7d606 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
 * Configuration options available inside /proc/sys/fs/epoll/
 */
 /* Maximum number of epoll watched descriptors, per user */
-static int max_user_watches __read_mostly;
+static long max_user_watches __read_mostly;
 /*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
 #include <linux/sysctl.h>
-static int zero;
+static long zero;
+static long long_max = LONG_MAX;
 ctl_table epoll_table[] = {
        {
                .procname       = "max_user_watches",
                .data           = &max_user_watches,
-                .maxlen         = sizeof(int),
+                .maxlen         = sizeof(max_user_watches),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = &zero,
+                .extra2         = &long_max,
        },
        { }
 };
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        /* At this point it is safe to free the eventpoll item */
        kmem_cache_free(epi_cache, epi);
-        atomic_dec(&ep->user->epoll_watches);
+        atomic_long_dec(&ep->user->epoll_watches);
        return 0;
 }
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 {
        int error, revents, pwake = 0;
        unsigned long flags;
+        long user_watches;
        struct epitem *epi;
        struct ep_pqueue epq;
-        if (unlikely(atomic_read(&ep->user->epoll_watches) >=
+        user_watches = atomic_long_read(&ep->user->epoll_watches);
-                     max_user_watches))
+        if (unlikely(user_watches >= max_user_watches))
                return -ENOSPC;
        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
                return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
        spin_unlock_irqrestore(&ep->lock, flags);
-        atomic_inc(&ep->user->epoll_watches);
+        atomic_long_inc(&ep->user->epoll_watches);
        /* We have to call this outside the lock */
        if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
         */
        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
+        BUG_ON(max_user_watches < 0);
        /* Initialize the structure used to perform safe poll wait head wake ups */
        ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e..c62efcb959c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
 #ifdef CONFIG_MMU
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+        struct mm_struct *mm = current->mm;
+        long diff = (long)(pages - bprm->vma_pages);
+        if (!mm || !diff)
+                return;
+        bprm->vma_pages = pages;
+#ifdef SPLIT_RSS_COUNTING
+        add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+        spin_lock(&mm->page_table_lock);
+        add_mm_counter(mm, MM_ANONPAGES, diff);
+        spin_unlock(&mm->page_table_lock);
+#endif
+}
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
                struct rlimit *rlim;
+                acct_arg_size(bprm, size / PAGE_SIZE);
                /*
                 * We've historically supported up to 32 pages (ARG_MAX)
                 * of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);
+        err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+        if (err)
+                goto err;
        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 #else
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
        /*
         * Release all of the old mmap stuff
         */
+        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
        return retval;
 out:
-        if (bprm->mm)
+        if (bprm->mm) {
-                mmput (bprm->mm);
+                acct_arg_size(bprm, 0);
+                mmput(bprm->mm);
+        }
 out_file:
        if (bprm->file) {
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 79c3ae6e045..8c6c4669b38 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
+static void exofs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
 /*
 * Remove an inode from the cache
 */
 static void exofs_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+        call_rcu(&inode->i_rcu, exofs_i_callback);
 }
 /*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51b304056f1..4b6825740dd 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
                void *context)
 {
        struct dentry *dentry, *toput = NULL;
+        struct inode *inode;
        if (acceptable(context, result))
                return result;
-        spin_lock(&dcache_lock);
+        inode = result->d_inode;
-        list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
+        spin_lock(&inode->i_lock);
-                dget_locked(dentry);
+        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
-                spin_unlock(&dcache_lock);
+                dget(dentry);
+                spin_unlock(&inode->i_lock);
                if (toput)
                        dput(toput);
                if (dentry != result && acceptable(context, dentry)) {
                        dput(result);
                        return dentry;
                }
-                spin_lock(&dcache_lock);
+                spin_lock(&inode->i_lock);
                toput = dentry;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        if (toput)
                dput(toput);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bad..7b4180554a6 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 int
-ext2_check_acl(struct inode *inode, int mask)
+ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac4..c939b7b1209 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 /* acl.c */
-extern int ext2_check_acl (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int, unsigned int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206a..47cda410b54 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
 typedef struct ext2_dir_entry_2 ext2_dirent;
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
 static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
 {
        unsigned len = le16_to_cpu(dlen);
+#if (PAGE_CACHE_SIZE >= 65536)
        if (len == EXT2_MAX_REC_LEN)
                return 1 << 16;
+#endif
        return len;
 }
 static inline __le16 ext2_rec_len_to_disk(unsigned len)
 {
+#if (PAGE_CACHE_SIZE >= 65536)
        if (len == (1 << 16))
                return cpu_to_le16(EXT2_MAX_REC_LEN);
        else
                BUG_ON(len > (1 << 16));
+#endif
        return cpu_to_le16(len);
 }
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
                p = (ext2_dirent *)(kaddr + offs);
                rec_len = ext2_rec_len_from_disk(p->rec_len);
-                if (rec_len < EXT2_DIR_REC_LEN(1))
+                if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
                        goto Eshort;
-                if (rec_len & 3)
+                if (unlikely(rec_len & 3))
                        goto Ealign;
-                if (rec_len < EXT2_DIR_REC_LEN(p->name_len))
+                if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
                        goto Enamelen;
-                if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+                if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
                        goto Espan;
-                if (le32_to_cpu(p->inode) > max_inumber)
+                if (unlikely(le32_to_cpu(p->inode) > max_inumber))
                        goto Einumber;
        }
        if (offs != limit)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e329..2e1d8341d82 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
        inode = NULL;
        if (ino) {
                inode = ext2_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                ext2_error(dir->i_sb, __func__,
                                                "deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d89e0b6a2d7..7731695e65d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
-void ext2_error (struct super_block * sb, const char * function,
+void ext2_error(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
        struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
        }
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
 void ext2_msg(struct super_block *sb, const char *prefix,
                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
 }
@@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ext2_destroy_inode(struct inode *inode)
+static void ext2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
+static void ext2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ext2_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be327..c2e4dce984d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
                        goto found;
                entry = next;
        }
-        /* Check the remaining name entries */
-        while (!IS_LAST_ENTRY(entry)) {
-                struct ext2_xattr_entry *next =
-                        EXT2_XATTR_NEXT(entry);
-                if ((char *)next >= end)
-                        goto bad_block;
-                entry = next;
-        }
        if (ext2_xattr_cache_insert(bh))
                ea_idebug(inode, "cache insert failed");
        error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 /*
 * ext2_xattr_set()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe21218..e4fa49e6c53 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 int
-ext3_check_acl(struct inode *inode, int mask)
+ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de..5faf8048e90 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 /* acl.c */
-extern int ext3_check_acl (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int, unsigned int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db2264942..045995c8ce5 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 /*
 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/*
+ * Calculate the block group number and offset, given a block number.
+ *
+ * @sb:         super block of the file system
+ * @blocknr:    file system block number; s_first_data_block is subtracted
+ *              from it before it is split into group and offset
+ * @blockgrpp:  if non-NULL, receives blocknr / EXT3_BLOCKS_PER_GROUP(sb)
+ * @offsetp:    if non-NULL, receives blocknr % EXT3_BLOCKS_PER_GROUP(sb)
+ *
+ * Either output pointer may be NULL when the caller does not need that
+ * value.
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+        ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+        struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+        blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+        if (offsetp)
+                *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+        if (blockgrpp)
+                *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
 /**
 * ext3_get_group_desc() -- load group descriptor from disk
 * @sb:                 super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
        return ext3_bg_num_gdb_meta(sb,group);
 }
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:                 super block for file system
+ * @group:              allocation group to trim
+ * @start:              first group block to examine
+ * @max:                end of the range to examine; only group blocks
+ *                      below @max are claimed and trimmed
+ * @minblocks:          minimum extent block count
+ *
+ * ext3_trim_all_free walks through group's block bitmap searching for free
+ * blocks. When a free block is found, it tries to allocate this block and
+ * the free blocks following it to get the biggest free extent possible,
+ * until it reaches any used block. Then it issues a TRIM command on this
+ * extent and frees the extent in the block bitmap. This is done until the
+ * whole group is scanned.
+ *
+ * The group descriptor is not a parameter; it is looked up here with
+ * ext3_get_group_desc().
+ *
+ * Returns the number of blocks handed to sb_issue_discard(), or a negative
+ * error code: the error from starting the journal handle, -EIO when the
+ * block bitmap or group descriptor cannot be obtained, a journal access or
+ * dirty-metadata error, a discard error, or -ERESTARTSYS when a fatal
+ * signal is pending.
+ */
+ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
+                                ext3_grpblk_t start, ext3_grpblk_t max,
+                                ext3_grpblk_t minblocks)
+{
+        handle_t *handle;
+        ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+        ext3_fsblk_t discard_block;
+        struct ext3_sb_info *sbi;
+        struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+        struct ext3_group_desc *gdp;
+        int err = 0, ret = 0;
+        /*
+         * We will update one block bitmap, and one group descriptor
+         */
+        handle = ext3_journal_start_sb(sb, 2);
+        if (IS_ERR(handle))
+                return PTR_ERR(handle);
+        bitmap_bh = read_block_bitmap(sb, group);
+        if (!bitmap_bh) {
+                err = -EIO;
+                goto err_out;
+        }
+        BUFFER_TRACE(bitmap_bh, "getting undo access");
+        err = ext3_journal_get_undo_access(handle, bitmap_bh);
+        if (err)
+                goto err_out;
+        gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+        if (!gdp) {
+                err = -EIO;
+                goto err_out;
+        }
+        BUFFER_TRACE(gdp_bh, "get_write_access");
+        err = ext3_journal_get_write_access(handle, gdp_bh);
+        if (err)
+                goto err_out;
+        free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+        sbi = EXT3_SB(sb);
+         /* Walk through the whole group */
+        while (start < max) {
+                start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+                if (start < 0)
+                        break;
+                next = start;
+                /*
+                 * Allocate contiguous free extents by setting bits in the
+                 * block bitmap
+                 */
+                while (next < max
+                        && claim_block(sb_bgl_lock(sbi, group),
+                                        next, bitmap_bh)) {
+                        next++;
+                }
+                 /* We did not claim any blocks */
+                if (next == start)
+                        continue;
+                discard_block = (ext3_fsblk_t)start +
+                                ext3_group_first_block_no(sb, group);
+                /*
+                 * Update counters: start - next is negative, so the claimed
+                 * blocks are taken out of the free block counts before the
+                 * discard is issued
+                 */
+                spin_lock(sb_bgl_lock(sbi, group));
+                le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+                spin_unlock(sb_bgl_lock(sbi, group));
+                percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+                /* Do not issue a TRIM on extents smaller than minblocks */
+                if ((next - start) < minblocks)
+                        goto free_extent;
+                 /*
+                  * Send the TRIM command down to the device. err is only
+                  * examined after the extent has been released again below.
+                  */
+                err = sb_issue_discard(sb, discard_block, next - start,
+                                       GFP_NOFS, 0);
+                count += (next - start);
+free_extent:
+                freed = 0;
+                /*
+                 * Clear bits in the bitmap again, counting in freed the
+                 * ones that were still set
+                 */
+                for (bit = start; bit < next; bit++) {
+                        BUFFER_TRACE(bitmap_bh, "clear bit");
+                        if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+                                                bit, bitmap_bh->b_data)) {
+                                ext3_error(sb, __func__,
+                                        "bit already cleared for block "E3FSBLK,
+                                         (unsigned long)bit);
+                                BUFFER_TRACE(bitmap_bh, "bit already cleared");
+                        } else {
+                                freed++;
+                        }
+                }
+                /* Update counters: give the released blocks back */
+                spin_lock(sb_bgl_lock(sbi, group));
+                le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+                spin_unlock(sb_bgl_lock(sbi, group));
+                percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+                start = next;
+                if (err < 0) {
+                        if (err != -EOPNOTSUPP)
+                                ext3_warning(sb, __func__, "Discard command "
+                                             "returned error %d\n", err);
+                        break;
+                }
+                if (fatal_signal_pending(current)) {
+                        err = -ERESTARTSYS;
+                        break;
+                }
+                cond_resched();
+                /* No more suitable extents */
+                if ((free_blocks - count) < minblocks)
+                        break;
+        }
+        /* We dirtied the bitmap block */
+        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+        ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+        if (!err)
+                err = ret;
+        /* And the group descriptor block */
+        BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+        ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+        if (!err)
+                err = ret;
+        ext3_debug("trimmed %d blocks in the group %d\n",
+                count, group);
+err_out:
+        if (err)
+                count = err;
+        ext3_journal_stop(handle);
+        brelse(bitmap_bh);
+        return count;
+}
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @sb:                 superblock for filesystem
+ * @range:              fstrim_range structure. On entry range->start is the
+ *                      first Byte to trim, range->len the number of Bytes
+ *                      to trim from start and range->minlen the minimum
+ *                      extent length in Bytes; all three are converted to
+ *                      file system blocks by shifting right by
+ *                      sb->s_blocksize_bits. On the paths that reach the
+ *                      out label, range->len is overwritten with the number
+ *                      of Bytes trimmed.
+ *
+ * ext3_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such group the ext3_trim_all_free function
+ * is invoked to trim all free space.
+ *
+ * Returns 0 on success (including when start is at or beyond the block
+ * count, in which case nothing is trimmed), -EINVAL when minlen exceeds
+ * the number of blocks per group or when the first group lies after the
+ * last one, or the negative error returned by ext3_trim_all_free().
+ *
+ * NOTE(review): a NULL group descriptor only ends the loop; no error is
+ * reported for it here - confirm this is intended.
+ * NOTE(review): when len is smaller than a group, last_block is set to
+ * first_block + len without being clamped to EXT3_BLOCKS_PER_GROUP(sb) -
+ * confirm the callee copes with that.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+        ext3_grpblk_t last_block, first_block, free_blocks;
+        unsigned long first_group, last_group;
+        unsigned long group, ngroups;
+        struct ext3_group_desc *gdp;
+        struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+        uint64_t start, len, minlen, trimmed;
+        ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+        int ret = 0;
+        start = range->start >> sb->s_blocksize_bits;
+        len = range->len >> sb->s_blocksize_bits;
+        minlen = range->minlen >> sb->s_blocksize_bits;
+        trimmed = 0;
+        if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+                return -EINVAL;
+        if (start >= max_blks)
+                goto out;
+        if (start < le32_to_cpu(es->s_first_data_block)) {
+                len -= le32_to_cpu(es->s_first_data_block) - start;
+                start = le32_to_cpu(es->s_first_data_block);
+        }
+        if (start + len > max_blks)
+                len = max_blks - start;
+        ngroups = EXT3_SB(sb)->s_groups_count;
+        smp_rmb();
+        /*
+         * Determine first and last group to examine based on start and len.
+         * The offset computed for the end of the range is not used:
+         * last_block is reset to the full group size right below.
+         */
+        ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+                                     &first_group, &first_block);
+        ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+                                     &last_group, &last_block);
+        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+        last_block = EXT3_BLOCKS_PER_GROUP(sb);
+        if (first_group > last_group)
+                return -EINVAL;
+        for (group = first_group; group <= last_group; group++) {
+                gdp = ext3_get_group_desc(sb, group, NULL);
+                if (!gdp)
+                        break;
+                free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+                if (free_blocks < minlen)
+                        continue;
+                if (len >= EXT3_BLOCKS_PER_GROUP(sb))
+                        len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
+                else
+                        last_block = first_block + len;
+                ret = ext3_trim_all_free(sb, group, first_block,
+                                        last_block, minlen);
+                if (ret < 0)
+                        break;
+                trimmed += ret;
+                first_block = 0; /* later groups are scanned from block 0 */
+        }
+        if (ret >= 0)
+                ret = 0;
+out:
+        range->len = trimmed * sb->s_blocksize;
+        return ret;
+}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf..34f0a072b93 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
        const char * error_msg = NULL;
        const int rlen = ext3_rec_len_from_disk(de->rec_len);
-        if (rlen < EXT3_DIR_REC_LEN(1))
+        if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
                error_msg = "rec_len is smaller than minimal";
-        else if (rlen % 4 != 0)
+        else if (unlikely(rlen % 4 != 0))
                error_msg = "rec_len % 4 != 0";
-        else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+        else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
-        else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+        else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
                error_msg = "directory entry across blocks";
-        else if (le32_to_cpu(de->inode) >
+        else if (unlikely(le32_to_cpu(de->inode) >
-                        le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+                        le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
-        if (error_msg != NULL)
+        if (unlikely(error_msg != NULL))
                ext3_error (dir->i_sb, function,
                        "bad entry in directory #%lu: %s - "
                        "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
                        dir->i_ino, error_msg, offset,
                        (unsigned long) le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd..ae94f6d949f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-                        ext3_journal_dirty_metadata(handle, bh);
+                        if (ext3_journal_dirty_metadata(handle, bh))
+                                return;
                }
                ext3_mark_inode_dirty(handle, inode);
                truncate_restart_transaction(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
-                        ext3_journal_get_write_access(handle, bh);
+                        if (ext3_journal_get_write_access(handle, bh))
+                                return;
                }
        }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783..fc080dd561f 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
+        case FITRIM: {
+                struct super_block *sb = inode->i_sb;
+                struct fstrim_range range;
+                int ret = 0;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&range, (struct fstrim_range *)arg,
+                                   sizeof(range)))
+                        return -EFAULT;
+                ret = ext3_trim_fs(sb, &range);
+                if (ret < 0)
+                        return ret;
+                if (copy_to_user((struct fstrim_range *)arg, &range,
+                                 sizeof(range)))
+                        return -EFAULT;
+                return 0;
+        }
        default:
                return -ENOTTY;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b..b27ba71810e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
        struct buffer_head * bh_use[NAMEI_RA_SIZE];
        struct buffer_head * bh, *ret = NULL;
        unsigned long start, block, b;
+        const u8 *name = entry->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
        namelen = entry->len;
        if (namelen > EXT3_NAME_LEN)
                return NULL;
+        if ((namelen <= 2) && (name[0] == '.') &&
+            (name[1] == '.' || name[1] == 0)) {
+                /*
+                 * "." or ".." will only be in the first block
+                 * NFS may look up ".."; "." should be handled by the VFS
+                 */
+                block = start = 0;
+                nblocks = 1;
+                goto restart;
+        }
        if (is_dx(dir)) {
                bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
                /*
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
                        struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
                        int *err)
 {
-        struct super_block * sb;
+        struct super_block *sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-        u32 hash;
        struct dx_frame frames[2], *frame;
-        struct ext3_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        unsigned long block;
        int retval;
-        int namelen = entry->len;
-        const u8 *name = entry->name;
-        sb = dir->i_sb;
+        if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
-        /* NFS may look up ".." - look at dx_root directory block */
+                return NULL;
-        if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
-                if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
-                        return NULL;
-        } else {
-                frame = frames;
-                frame->bh = NULL;                       /* for dx_release() */
-                frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-                dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-        }
-        hash = hinfo.hash;
        do {
                block = dx_get_block(frame->at);
                if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
                        goto errout;
-                de = (struct ext3_dir_entry_2 *) bh->b_data;
-                top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                       EXT3_DIR_REC_LEN(0));
-                for (; de < top; de = ext3_next_entry(de)) {
-                        int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
-                                  + ((char *) de - bh->b_data);
-                        if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
-                                brelse(bh);
-                                *err = ERR_BAD_DX_DIR;
-                                goto errout;
-                        }
-                        if (ext3_match(namelen, name, de)) {
+                retval = search_dirblock(bh, dir, entry,
-                                *res_dir = de;
+                                         block << EXT3_BLOCK_SIZE_BITS(sb),
-                                dx_release(frames);
+                                         res_dir);
-                                return bh;
+                if (retval == 1) {
-                        }
+                        dx_release(frames);
+                        return bh;
                }
-                brelse (bh);
+                brelse(bh);
+                if (retval == -1) {
+                        *err = ERR_BAD_DX_DIR;
+                        goto errout;
+                }
                /* Check to see if we should continue to search */
-                retval = ext3_htree_next_block(dir, hash, frame,
+                retval = ext3_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
                        return ERR_PTR(-EIO);
                }
                inode = ext3_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                ext3_error(dir->i_sb, __func__,
                                                "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-                ext3_journal_dirty_metadata(handle, frames[0].bh);
+                err = ext3_journal_dirty_metadata(handle, frames[0].bh);
+                if (err)
+                        goto journal_error;
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
                if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
+                        int err;
                        BUFFER_TRACE(bh, "get_write_access");
-                        ext3_journal_get_write_access(handle, bh);
+                        err = ext3_journal_get_write_access(handle, bh);
+                        if (err)
+                                goto journal_error;
                        if (pde)
                                pde->rec_len = ext3_rec_len_to_disk(
                                        ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
                                de->inode = 0;
                        dir->i_version++;
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-                        ext3_journal_dirty_metadata(handle, bh);
+                        err = ext3_journal_dirty_metadata(handle, bh);
+                        if (err) {
+journal_error:
+                                ext3_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        return 0;
                }
                i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 {
        handle_t *handle;
        struct inode * inode;
-        struct buffer_head * dir_block;
+        struct buffer_head * dir_block = NULL;
        struct ext3_dir_entry_2 * de;
        int err, retries = 0;
@@ -1790,15 +1793,14 @@ retry:
        inode->i_fop = &ext3_dir_operations;
        inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
-        if (!dir_block) {
+        if (!dir_block)
-                drop_nlink(inode); /* is this nlink == 0? */
+                goto out_clear_inode;
-                unlock_new_inode(inode);
-                ext3_mark_inode_dirty(handle, inode);
-                iput (inode);
-                goto out_stop;
-        }
        BUFFER_TRACE(dir_block, "get_write_access");
-        ext3_journal_get_write_access(handle, dir_block);
+        err = ext3_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
        de = (struct ext3_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
        ext3_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
        BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
-        ext3_journal_dirty_metadata(handle, dir_block);
+        err = ext3_journal_dirty_metadata(handle, dir_block);
-        brelse (dir_block);
+        if (err)
-        ext3_mark_inode_dirty(handle, inode);
+                goto out_clear_inode;
-        err = ext3_add_entry (handle, dentry, inode);
+        err = ext3_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext3_add_entry (handle, dentry, inode);
        if (err) {
+out_clear_inode:
                inode->i_nlink = 0;
                unlock_new_inode(inode);
                ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
        }
        inc_nlink(dir);
        ext3_update_dx_flag(dir);
-        ext3_mark_inode_dirty(handle, dir);
+        err = ext3_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
        ext3_journal_stop(handle);
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
                        goto end_rename;
        } else {
                BUFFER_TRACE(new_bh, "get write access");
-                ext3_journal_get_write_access(handle, new_bh);
+                retval = ext3_journal_get_write_access(handle, new_bh);
+                if (retval)
+                        goto journal_error;
                new_de->inode = cpu_to_le32(old_inode->i_ino);
                if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
                                              EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
                new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
                ext3_mark_inode_dirty(handle, new_dir);
                BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
-                ext3_journal_dirty_metadata(handle, new_bh);
+                retval = ext3_journal_dirty_metadata(handle, new_bh);
+                if (retval)
+                        goto journal_error;
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
        ext3_update_dx_flag(old_dir);
        if (dir_bh) {
                BUFFER_TRACE(dir_bh, "get_write_access");
-                ext3_journal_get_write_access(handle, dir_bh);
+                retval = ext3_journal_get_write_access(handle, dir_bh);
+                if (retval)
+                        goto journal_error;
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
-                ext3_journal_dirty_metadata(handle, dir_bh);
+                retval = ext3_journal_dirty_metadata(handle, dir_bh);
+                if (retval) {
+journal_error:
+                        ext3_std_error(new_dir->i_sb, retval);
+                        goto end_rename;
+                }
                drop_nlink(old_dir);
                if (new_inode) {
                        drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b123..108b142e11e 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-                ext3_journal_dirty_metadata(handle, gdb);
+                err = ext3_journal_dirty_metadata(handle, gdb);
+                if (err) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext3_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(gdb);
                        goto exit_bh;
                }
-                ext3_journal_dirty_metadata(handle, gdb);
+                err = ext3_journal_dirty_metadata(handle, gdb);
+                if (err) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext3_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(it);
                        goto exit_bh;
                }
-                ext3_journal_dirty_metadata(handle, it);
+                err = ext3_journal_dirty_metadata(handle, it);
+                if (err) {
+                        brelse(it);
+                        goto exit_bh;
+                }
                brelse(it);
                ext3_set_bit(bit, bh->b_data);
        }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
                        bh->b_data);
-        ext3_journal_dirty_metadata(handle, bh);
+        err = ext3_journal_dirty_metadata(handle, bh);
+        if (err)
+                goto exit_bh;
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
                        bh->b_data);
-        ext3_journal_dirty_metadata(handle, bh);
+        err = ext3_journal_dirty_metadata(handle, bh);
 exit_bh:
        brelse(bh);
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
-        ext3_journal_dirty_metadata(handle, dind);
+        err = ext3_journal_dirty_metadata(handle, dind);
+        if (err)
+                goto exit_group_desc;
        brelse(dind);
+        dind = NULL;
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
-        ext3_mark_iloc_dirty(handle, inode, &iloc);
+        err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+        if (err)
+                goto exit_group_desc;
        memset((*primary)->b_data, 0, sb->s_blocksize);
-        ext3_journal_dirty_metadata(handle, *primary);
+        err = ext3_journal_dirty_metadata(handle, *primary);
+        if (err)
+                goto exit_group_desc;
        o_group_desc = EXT3_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        if (err)
+                goto exit_inode;
        return 0;
+exit_group_desc:
+        kfree(n_group_desc);
 exit_inode:
        //ext3_journal_release_buffer(handle, iloc.bh);
        brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
                }
                ext3_debug("update metadata backup %#04lx\n",
                          (unsigned long)bh->b_blocknr);
-                if ((err = ext3_journal_get_write_access(handle, bh)))
+                if ((err = ext3_journal_get_write_access(handle, bh))) {
+                        brelse(bh);
                        break;
+                }
                lock_buffer(bh);
                memcpy(bh->b_data, data, size);
                if (rest)
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                ext3_journal_dirty_metadata(handle, bh);
+                err = ext3_journal_dirty_metadata(handle, bh);
                brelse(bh);
+                if (err)
+                        break;
        }
        if ((err2 = ext3_journal_stop(handle)) && !err)
                err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        /* Update the global fs size fields */
        sbi->s_groups_count++;
-        ext3_journal_dirty_metadata(handle, primary);
+        err = ext3_journal_dirty_metadata(handle, primary);
+        if (err)
+                goto exit_journal;
        /* Update the reserved block counts only once the new group is
         * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        percpu_counter_add(&sbi->s_freeinodes_counter,
                           EXT3_INODES_PER_GROUP(sb));
-        ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
 exit_journal:
        mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                goto exit_put;
        }
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
-        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+        err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+        if (err) {
+                ext3_warning(sb, __func__,
+                             "error %d on journal dirty metadata", err);
+                ext3_journal_stop(handle);
+                goto exit_put;
+        }
        ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
                   o_blocks_count, o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b501..85c8cc8f247 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/vfs.h>
@@ -144,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
 void ext3_msg(struct super_block *sb, const char *prefix,
                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
 }
@@ -196,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
                        sb->s_id);
 }
-void ext3_error (struct super_block * sb, const char * function,
+void ext3_error(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        ext3_handle_error(sb);
@@ -275,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
 * case we take the easy way out and panic immediately.
 */
-void ext3_abort (struct super_block * sb, const char * function,
+void ext3_abort(struct super_block *sb, const char *function,
-                 const char * fmt, ...)
+                 const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (test_opt(sb, ERRORS_PANIC))
@@ -301,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
                journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
-void ext3_warning (struct super_block * sb, const char * function,
+void ext3_warning(struct super_block *sb, const char *function,
-                   const char * fmt, ...)
+                  const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
-               sb->s_id, function);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
 }
@@ -347,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
-        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
        if (IS_ERR(bdev))
                goto fail;
        return bdev;
@@ -364,8 +381,7 @@ fail:
 */
 static int ext3_blkdev_put(struct block_device *bdev)
 {
-        bd_release(bdev);
+        return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-        return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -480,6 +496,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
+static void ext3_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+}
 static void ext3_destroy_inode(struct inode *inode)
 {
        if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -490,7 +513,7 @@ static void ext3_destroy_inode(struct inode *inode)
                                false);
                dump_stack();
        }
-        kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+        call_rcu(&inode->i_rcu, ext3_i_callback);
 }
 static void init_once(void *foo)
@@ -731,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
 static int ext3_mark_dquot_dirty(struct dquot *dquot);
 static int ext3_write_info(struct super_block *sb, int type);
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-                                char *path);
+                         struct path *path);
 static int ext3_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@ -1842,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        if (generic_check_addressable(sb->s_blocksize_bits,
+        err = generic_check_addressable(sb->s_blocksize_bits,
-                                      le32_to_cpu(es->s_blocks_count))) {
+                                        le32_to_cpu(es->s_blocks_count));
+        if (err) {
                ext3_msg(sb, KERN_ERR,
                        "error: filesystem is too large to mount safely");
                if (sizeof(sector_t) < 8)
                        ext3_msg(sb, KERN_ERR,
                                "error: CONFIG_LBDAF not enabled");
+                ret = err;
                goto failed_mount;
        }
@@ -2136,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
        if (bdev == NULL)
                return NULL;
-        if (bd_claim(bdev, sb)) {
-                ext3_msg(sb, KERN_ERR,
-                        "error: failed to claim external journal device");
-                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-                return NULL;
-        }
        blocksize = sb->s_blocksize;
        hblock = bdev_logical_block_size(bdev);
        if (blocksize < hblock) {
@@ -2291,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
        EXT3_SB(sb)->s_journal = journal;
        ext3_clear_journal_err(sb, es);
-        if (journal_devnum &&
+        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
@@ -2859,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
 * Standard function to be called on quota_on
 */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-                         char *name)
+                         struct path *path)
 {
        int err;
-        struct path path;
        if (!test_opt(sb, QUOTA))
                return -EINVAL;
-        err = kern_path(name, LOOKUP_FOLLOW, &path);
-        if (err)
-                return err;
        /* Quotafile not on the same filesystem? */
-        if (path.mnt->mnt_sb != sb) {
+        if (path->mnt->mnt_sb != sb)
-                path_put(&path);
                return -EXDEV;
-        }
        /* Journaling quota? */
        if (EXT3_SB(sb)->s_qf_names[type]) {
                /* Quotafile not in fs root? */
-                if (path.dentry->d_parent != sb->s_root)
+                if (path->dentry->d_parent != sb->s_root)
                        ext3_msg(sb, KERN_WARNING,
                                "warning: Quota file not on filesystem root. "
                                "Journaled quota will not work.");
@@ -2889,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
         * When we journal data on quota file, we have to flush journal to see
         * all updates to the file when we bypass pagecache...
         */
-        if (ext3_should_journal_data(path.dentry->d_inode)) {
+        if (ext3_should_journal_data(path->dentry->d_inode)) {
                /*
                 * We don't need to lock updates but journal_flush() could
                 * otherwise be livelocked...
@@ -2897,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
                journal_lock_updates(EXT3_SB(sb)->s_journal);
                err = journal_flush(EXT3_SB(sb)->s_journal);
                journal_unlock_updates(EXT3_SB(sb)->s_journal);
-                if (err) {
+                if (err)
-                        path_put(&path);
                        return err;
-                }
        }
-        err = dquot_quota_on_path(sb, type, format_id, &path);
+        return dquot_quota_on(sb, type, format_id, path);
-        path_put(&path);
-        return err;
 }
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa8..32e6cc23bd9 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 /*
 * ext3_xattr_set_handle()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ea..e0270d1f8d8 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+        acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac..dec821168fd 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int, unsigned int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c67..adf96b82278 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metadata, but we do account for it.
         */
-        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+        if (!(*errp) &&
+            ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40..164c56092e5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
        return (ext4_filetype_table[filetype]);
 }
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
-                           struct inode *dir,
+                           struct inode *dir, struct file *filp,
                           struct ext4_dir_entry_2 *de,
                           struct buffer_head *bh,
                           unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
        const int rlen = ext4_rec_len_from_disk(de->rec_len,
                                                dir->i_sb->s_blocksize);
-        if (rlen < EXT4_DIR_REC_LEN(1))
+        if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
                error_msg = "rec_len is smaller than minimal";
-        else if (rlen % 4 != 0)
+        else if (unlikely(rlen % 4 != 0))
                error_msg = "rec_len % 4 != 0";
-        else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+        else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
-        else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+        else if (unlikely(((char *) de - bh->b_data) + rlen >
+                          dir->i_sb->s_blocksize))
                error_msg = "directory entry across blocks";
-        else if (le32_to_cpu(de->inode) >
+        else if (unlikely(le32_to_cpu(de->inode) >
-                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
+        else
+                return 0;
-        if (error_msg != NULL)
+        if (filp)
-                ext4_error_inode(dir, function, line, bh->b_blocknr,
+                ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
-                        "bad entry in directory: %s - "
+                                "bad entry in directory: %s - offset=%u(%u), "
-                        "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+                                "inode=%u, rec_len=%d, name_len=%d",
-                        error_msg, (unsigned) (offset%bh->b_size), offset,
+                                error_msg, (unsigned) (offset%bh->b_size),
-                        le32_to_cpu(de->inode),
+                                offset, le32_to_cpu(de->inode),
-                        rlen, de->name_len);
+                                rlen, de->name_len);
-        return error_msg == NULL ? 1 : 0;
+        else
+                ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+                                "bad entry in directory: %s - offset=%u(%u), "
+                                "inode=%u, rec_len=%d, name_len=%d",
+                                error_msg, (unsigned) (offset%bh->b_size),
+                                offset, le32_to_cpu(de->inode),
+                                rlen, de->name_len);
+        return 1;
 }
 static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
                 */
                if (!bh) {
                        if (!dir_has_error) {
-                                EXT4_ERROR_INODE(inode, "directory "
+                                EXT4_ERROR_FILE(filp, 0,
-                                           "contains a hole at offset %Lu",
+                                                "directory contains a "
+                                                "hole at offset %llu",
                                           (unsigned long long) filp->f_pos);
                                dir_has_error = 1;
                        }
@@ -194,8 +210,8 @@ revalidate:
                while (!error && filp->f_pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-                        if (!ext4_check_dir_entry(inode, de,
+                        if (ext4_check_dir_entry(inode, filp, de,
-                                                  bh, offset)) {
+                                                 bh, offset)) {
                                /*
                                 * On error, skip the f_pos to the next block
                                 */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f8..0c8d97b56f3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)                 \
        ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
-#define EXT4_ERROR_FILE(file, fmt, a...)        \
+#define EXT4_ERROR_FILE(file, block, fmt, a...)                         \
-        ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -177,7 +177,7 @@ struct mpage_da_data {
 struct ext4_io_page {
        struct page     *p_page;
-        int             p_count;
+        atomic_t        p_count;
 };
 #define MAX_IO_PAGES 128
@@ -561,23 +561,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD       FS_IOC32_SETVERSION
 #endif
+/* Max physical block we can address w/o extents */
-/*
- *  Mount options
- */
-struct ext4_mount_options {
-        unsigned long s_mount_opt;
-        uid_t s_resuid;
-        gid_t s_resgid;
-        unsigned long s_commit_interval;
-        u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-        int s_jquota_fmt;
-        char *s_qf_names[MAXQUOTAS];
-#endif
-};
-/* Max physical block we can addres w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF
 /*
@@ -709,6 +693,8 @@ do {									       \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
                ext4_decode_extra_time(&(inode)->xtime,                        \
                                       raw_inode->xtime ## _extra);            \
+        else                                                                   \
+                (inode)->xtime.tv_nsec = 0;                                    \
 } while (0)
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                        \
@@ -719,6 +705,8 @@ do {									       \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))            \
                ext4_decode_extra_time(&(einode)->xtime,                       \
                                       raw_inode->xtime ## _extra);            \
+        else                                                                   \
+                (einode)->xtime.tv_nsec = 0;                                   \
 } while (0)
 #define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do {									       \
 /*
 * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
 */
 struct ext4_ext_cache {
        ext4_fsblk_t    ec_start;
        ext4_lblk_t     ec_block;
        __u32           ec_len; /* must be 32bit to return holes */
-        __u32           ec_type;
 };
 /*
@@ -774,10 +763,12 @@ struct ext4_inode_info {
         * near to their parent directory's inode.
         */
        ext4_group_t    i_block_group;
+        ext4_lblk_t     i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
        unsigned long   i_state_flags;          /* Dynamic state flags */
+#endif
        unsigned long   i_flags;
-        ext4_lblk_t             i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
        /*
         * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
         */
        struct rw_semaphore i_data_sem;
        struct inode vfs_inode;
-        struct jbd2_inode jinode;
+        struct jbd2_inode *jinode;
        struct ext4_ext_cache i_cached_extent;
        /*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
        unsigned int i_allocated_meta_blocks;
-        unsigned short i_delalloc_reserved_flag;
+        ext4_lblk_t i_da_metadata_calc_last_lblock;
-        sector_t i_da_metadata_calc_last_lblock;
        int i_da_metadata_calc_len;
        /* on-disk additional length */
        __u16 i_extra_isize;
-        spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
@@ -856,9 +845,12 @@ struct ext4_inode_info {
        /* completed IOs that might need unwritten extents handling */
        struct list_head i_completed_io_list;
        spinlock_t i_completed_io_lock;
+        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        /* current io_end structure for async DIO write */
        ext4_io_end_t *cur_aio_dio;
+        spinlock_t i_block_reservation_lock;
        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
@@ -909,17 +901,27 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM     0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT       0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE     0x80000000 /* Initialize uninitialized itables */
-#define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)              EXT4_SB(sb)->s_mount_opt &= \
-#define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
+                                                ~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)                EXT4_SB(sb)->s_mount_opt |= \
+                                                EXT4_MOUNT_##opt
 #define test_opt(sb, opt)               (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)
+#define clear_opt2(sb, opt)             EXT4_SB(sb)->s_mount_opt2 &= \
+                                                ~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)               EXT4_SB(sb)->s_mount_opt2 |= \
+                                                EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)              (EXT4_SB(sb)->s_mount_opt2 & \
+                                         EXT4_MOUNT2_##opt)
 #define ext4_set_bit                    ext2_set_bit
 #define ext4_set_bit_atomic             ext2_set_bit_atomic
 #define ext4_clear_bit                  ext2_clear_bit
@@ -1085,6 +1087,7 @@ struct ext4_sb_info {
        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
        struct buffer_head **s_group_desc;
        unsigned int s_mount_opt;
+        unsigned int s_mount_opt2;
        unsigned int s_mount_flags;
        ext4_fsblk_t s_sb_block;
        uid_t s_resuid;
@@ -1235,24 +1238,39 @@ enum {
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
        EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
+        EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
 };
-#define EXT4_INODE_BIT_FNS(name, field)                                 \
+#define EXT4_INODE_BIT_FNS(name, field, offset)                         \
 static inline int ext4_test_inode_##name(struct inode *inode, int bit)  \
 {                                                                       \
-        return test_bit(bit, &EXT4_I(inode)->i_##field);                \
+        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);     \
 }                                                                       \
 static inline void ext4_set_inode_##name(struct inode *inode, int bit)  \
 {                                                                       \
-        set_bit(bit, &EXT4_I(inode)->i_##field);                        \
+        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);             \
 }                                                                       \
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 {                                                                       \
-        clear_bit(bit, &EXT4_I(inode)->i_##field);                      \
+        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);           \
+}
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+        (ei)->i_state_flags = 0;
 }
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
-EXT4_INODE_BIT_FNS(flag, flags)
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
-EXT4_INODE_BIT_FNS(state, state_flags)
+{
+        /* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1640,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
+#define ext4_check_dir_entry(dir, filp, de, bh, offset)                 \
-        __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+                                        (de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
@@ -1651,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1750,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
                             ext4_fsblk_t, const char *, ...)
        __attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-                            const char *, ...)
+                            ext4_fsblk_t, const char *, ...)
-        __attribute__ ((format (printf, 4, 5)));
+        __attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -2044,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                          ssize_t len);
@@ -2060,6 +2081,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd..2e29abb30f7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
 * structure for external API
 */
-#define EXT4_EXT_CACHE_NO       0
-#define EXT4_EXT_CACHE_GAP      1
-#define EXT4_EXT_CACHE_EXTENT   2
 /*
 * to be called by ext4_ext_walk_space()
 * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-        EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+        EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 }
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-                                         sector_t lblocks);
+                                         ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c..d8b992e658c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
        if (ext4_handle_valid(handle))
-                return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+                return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
        return 0;
 }
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1f..63a75810b7c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                struct ext4_extent *ex;
                depth = path->p_depth;
-                /* try to predict block placement */
+                /*
+                 * Try to predict block placement assuming that we are
+                 * filling in a file which will eventually be
+                 * non-sparse --- i.e., in the case of libbfd writing
+                 * an ELF object sections out-of-order but in a way
+                 * that eventually results in a contiguous object or
+                 * executable file, or some database extending a table
+                 * space file.  However, this is actually somewhat
+                 * non-ideal if we are writing a sparse file such as
+                 * qemu or KVM writing a raw image file that is going
+                 * to stay fairly sparse, since it will end up
+                 * fragmenting the file system's free space.  Maybe we
+                 * should have some heuristics or some way to allow
+                 * userspace to pass a hint to the file system,
+                 * especially if the latter case turns out to be
+                 * common.
+                 */
                ex = path[depth].p_ext;
-                if (ex)
+                if (ex) {
-                        return (ext4_ext_pblock(ex) +
+                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
-                                (block - le32_to_cpu(ex->ee_block)));
+                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+                        if (block > ext_block)
+                                return ext_pblk + (block - ext_block);
+                        else
+                                return ext_pblk - (ext_block - block);
+                }
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 * to allocate @blocks
 * Worse case is one block per extent
 */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                        cbex.ec_block = start;
                        cbex.ec_len = end - start;
                        cbex.ec_start = 0;
-                        cbex.ec_type = EXT4_EXT_CACHE_GAP;
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
                        cbex.ec_start = ext4_ext_pblock(ex);
-                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
                if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-                        __u32 len, ext4_fsblk_t start, int type)
+                        __u32 len, ext4_fsblk_t start)
 {
        struct ext4_ext_cache *cex;
        BUG_ON(len == 0);
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        cex = &EXT4_I(inode)->i_cached_extent;
-        cex->ec_type = type;
        cex->ec_block = block;
        cex->ec_len = len;
        cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
        }
        ext_debug(" -> %u:%lu\n", lblock, len);
-        ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+        ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                        struct ext4_extent *ex)
 {
        struct ext4_ext_cache *cex;
-        int ret = EXT4_EXT_CACHE_NO;
+        int ret = 0;
        /*
         * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
        cex = &EXT4_I(inode)->i_cached_extent;
        /* has cache valid data? */
-        if (cex->ec_type == EXT4_EXT_CACHE_NO)
+        if (cex->ec_len == 0)
                goto errout;
-        BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
-                        cex->ec_type != EXT4_EXT_CACHE_EXTENT);
        if (in_range(block, cex->ec_block, cex->ec_len)) {
                ex->ee_block = cpu_to_le32(cex->ec_block);
                ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                ext_debug("%u cached by %u:%u:%llu\n",
                                block,
                                cex->ec_block, cex->ec_len, cex->ec_start);
-                ret = cex->ec_type;
+                ret = 1;
        }
 errout:
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2825,14 +2845,14 @@ fix_extent_len:
 * to an uninitialized extent.
 *
 * Writing to an uninitized extent may result in splitting the uninitialized
- * extent into multiple /intialized unintialized extents (up to three)
+ * extent into multiple /initialized uninitialized extents (up to three)
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Somone is writing in middle of the extent
 *
 * One of more index blocks maybe needed if the extent tree grow after
- * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * the uninitialized extent split. To prevent ENOSPC occur at the IO
 * complete, we need to split the uninitialized extent before DIO submit
 * the IO. The uninitialized extent called at this time will be split
 * into three uninitialized extent(at most). After IO complete, the part
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 * Handle EOFBLOCKS_FL flag, clearing it if necessary
 */
 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
-                              struct ext4_map_blocks *map,
+                              ext4_lblk_t lblk,
                              struct ext4_ext_path *path,
                              unsigned int len)
 {
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
         * this turns out to be false, we can bail out from this
         * function immediately.
         */
-        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
            ext4_ext_get_actual_len(last_ex))
                return 0;
        /*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                        path);
                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path,
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
-                                                 map->m_len);
+                                                 path, map->m_len);
                } else
                        err = ret;
                goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
        if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
-                err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                         map->m_len);
                if (err < 0)
                        goto out2;
        }
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        struct ext4_extent_header *eh;
        struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-        int err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                  map->m_lblk, map->m_len, inode->i_ino);
        /* check in cache */
-        cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
+        if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
-        if (cache_type) {
+                if (!newex.ee_start_lo && !newex.ee_start_hi) {
-                if (cache_type == EXT4_EXT_CACHE_GAP) {
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
                                 * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                                goto out2;
                        }
                        /* we should allocate requested block */
-                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
+                } else {
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
                        goto out;
-                } else {
-                        BUG();
                }
        }
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* Do not put uninitialized extent in the cache */
                        if (!ext4_ext_is_uninitialized(ex)) {
                                ext4_ext_put_in_cache(inode, ee_block,
-                                                        ee_len, ee_start,
+                                                        ee_len, ee_start);
-                                                        EXT4_EXT_CACHE_EXTENT);
                                goto out;
                        }
                        ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
-        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
        if (err)
                goto out2;
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         * when it is _not_ an uninitialized extent.
         */
        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
+                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
-                                                EXT4_EXT_CACHE_EXTENT);
                ext4_update_inode_fsync_trans(handle, inode, 1);
        } else
                ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
        int err = 0;
        /*
+         * finish any pending end_io work so we won't run the risk of
+         * converting any truncated blocks to initialized later
+         */
+        ext4_flush_completed_IO(inode);
+        /*
         * probably first extent we're gonna free will be last in block
         */
        err = ext4_writepage_trans_blocks(inode);
@@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
 }
 /*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
 * operation, which gets called from sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which is
 * expected for file systems which do not support fallocate() system call).
 */
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
        handle_t *handle;
        loff_t new_size;
        unsigned int max_blocks;
@@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
+        /* We only support the FALLOC_FL_KEEP_SIZE mode */
+        if (mode & ~FALLOC_FL_KEEP_SIZE)
+                return -EOPNOTSUPP;
        /*
         * currently supporting (pre)allocate mode for extent-based
         * files _only_
@@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
-        /* preallocation to directories is currently not supported */
-        if (S_ISDIR(inode->i_mode))
-                return -ENODEV;
        map.m_lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because
@@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
        logical =  (__u64)newex->ec_block << blksize_bits;
-        if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+        if (newex->ec_start == 0) {
                pgoff_t offset;
                struct page *page;
                struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddcee..2e8322c8aa8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                        ext4_mark_super_dirty(sb);
                }
        }
+        /*
+         * Set up the jbd2_inode if we are opening the inode for
+         * writing and the journal is present
+         */
+        if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+                struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+                spin_lock(&inode->i_lock);
+                if (!ei->jinode) {
+                        if (!jinode) {
+                                spin_unlock(&inode->i_lock);
+                                return -ENOMEM;
+                        }
+                        ei->jinode = jinode;
+                        jbd2_journal_init_jbd_inode(ei->jinode, inode);
+                        jinode = NULL;
+                }
+                spin_unlock(&inode->i_lock);
+                if (unlikely(jinode != NULL))
+                        jbd2_free_inode(jinode);
+        }
        return dquot_file_open(inode, filp);
 }
@@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = {
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
+        .fallocate      = ext4_fallocate,
 };
 const struct inode_operations ext4_file_inode_operations = {
@@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
        .removexattr    = generic_removexattr,
 #endif
        .check_acl      = ext4_check_acl,
-        .fallocate      = ext4_fallocate,
        .fiemap         = ext4_fiemap,
 };
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf..7829b287822 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
 * to written.
 * The function return the number of pending IOs on success.
 */
-static int flush_completed_IO(struct inode *inode)
+extern int ext4_flush_completed_IO(struct inode *inode)
 {
        ext4_io_end_t *io;
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
        if (inode->i_sb->s_flags & MS_RDONLY)
                return 0;
-        ret = flush_completed_IO(inode);
+        ret = ext4_flush_completed_IO(inode);
        if (ret < 0)
                return ret;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..eb9097aec6f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);
        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 19161647046..9f7f9e49914 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -53,10 +55,18 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(
+        trace_ext4_begin_ordered_truncate(inode, new_size);
-                                        EXT4_SB(inode->i_sb)->s_journal,
+        /*
-                                        &EXT4_I(inode)->jinode,
+         * If jinode is zero, then we never opened the file for
-                                        new_size);
+         * writing, so there's no need to call
+         * jbd2_journal_begin_ordered_truncate() since there's no
+         * outstanding writes we need to flush.
+         */
+        if (!EXT4_I(inode)->jinode)
+                return 0;
+        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   new_size);
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -178,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
        handle_t *handle;
        int err;
+        trace_ext4_evict_inode(inode);
        if (inode->i_nlink) {
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
@@ -550,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 }
 /**
- *      ext4_blks_to_allocate: Look up the block map and count the number
+ *      ext4_blks_to_allocate - Look up the block map and count the number
 *      of direct blocks need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
@@ -589,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 /**
 *      ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ *      @handle: handle for this transaction
+ *      @inode: inode which needs allocated blocks
+ *      @iblock: the logical block to start allocating at
+ *      @goal: preferred physical block of allocation
 *      @indirect_blks: the number of blocks need to allocate for indirect
 *                      blocks
- *
+ *      @blks: number of desired blocks
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks(if needed) and the first direct block,
- *      @blks:  on return it will store the total number of allocated
+ *      @err: on return it will store the error code
- *              direct blocks
+ *
+ *      This function will return the number of blocks allocated as
+ *      requested by the passed-in parameters.
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -709,9 +726,11 @@ failed_out:
 /**
 *      ext4_alloc_branch - allocate and set up a chain of blocks.
+ *      @handle: handle for this transaction
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
+ *      @goal: preferred place for allocation
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
@@ -824,6 +843,7 @@ failed:
 /**
 * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
@@ -1079,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 * Calculate the number of metadata blocks need to reserve
 * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1318,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
         * avoid double accounting
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
@@ -1348,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1876,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
 * Reserve a single block located at lblock
 */
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2123,9 +2143,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                         */
                        if (unlikely(journal_data && PageChecked(page)))
                                err = __ext4_journalled_writepage(page, len);
-                        else
+                        else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                                err = ext4_bio_write_page(&io_submit, page,
                                                          len, mpd->wbc);
+                        else
+                                err = block_write_full_page(page,
+                                        noalloc_get_block_write, mpd->wbc);
                        if (!err)
                                mpd->pages_written++;
@@ -2234,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
         * affects functions in many different parts of the allocation
         * call path.  This flag exists primarily because we don't
         * want to change *many* call functions, so ext4_map_blocks()
-         * will set the magic i_delalloc_reserved_flag once the
+         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
         * inode's allocation semaphore is taken.
         *
         * If the blocks in questions were delalloc blocks, set
@@ -3357,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
         * doing I/O at all.
         *
         * We could call write_cache_pages(), and then redirty all of
-         * the pages by calling redirty_page_for_writeback() but that
+         * the pages by calling redirty_page_for_writepage() but that
         * would be ugly in the extreme.  So instead we would need to
         * replicate parts of the code in the above functions,
         * simplifying them becuase we wouldn't actually intend to
@@ -3715,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 retry:
        io_end = ext4_init_io_end(inode, GFP_ATOMIC);
        if (!io_end) {
-                if (printk_ratelimit())
+                pr_warn_ratelimited("%s: allocation fail\n", __func__);
-                        printk(KERN_WARNING "%s: allocation fail\n", __func__);
                schedule();
                goto retry;
        }
@@ -3740,9 +3762,9 @@ retry:
 * preallocated extents, and those write extend the file, no need to
 * fall back to buffered IO.
 *
- * For holes, we fallocate those blocks, mark them as unintialized
+ * For holes, we fallocate those blocks, mark them as uninitialized
 * If those blocks were preallocated, we mark sure they are splited, but
- * still keep the range to write as unintialized.
+ * still keep the range to write as uninitialized.
 *
 * The unwrritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still pending when return, we
@@ -4040,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
        if (ext4_should_journal_data(inode)) {
                err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
-                if (ext4_should_order_data(inode))
+                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
                        err = ext4_jbd2_file_inode(handle, inode);
                mark_buffer_dirty(bh);
        }
@@ -4164,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 {
        __le32 *p;
        int     flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+        int     err;
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4179,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, inode, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(inode->i_sb, err);
+                                return 1;
+                        }
+                }
+                err = ext4_mark_inode_dirty(handle, inode);
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
+                }
+                err = ext4_truncate_restart_trans(handle, inode,
+                                                  blocks_for_truncate(inode));
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
                }
-                ext4_mark_inode_dirty(handle, inode);
-                ext4_truncate_restart_trans(handle, inode,
-                                            blocks_for_truncate(inode));
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext4_journal_get_write_access(handle, bh);
@@ -4344,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                        (__le32 *) bh->b_data,
                                        (__le32 *) bh->b_data + addr_per_block,
                                        depth);
+                        brelse(bh);
                        /*
                         * Everything below this this pointer has been
@@ -4854,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
@@ -5113,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
        if (ext4_inode_blocks_set(handle, raw_inode, ei))
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
@@ -5410,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
         * will return the blocks that include the delayed allocation
         * blocks for this file.
         */
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
        return 0;
@@ -5649,6 +5683,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
        int err, ret;
        might_sleep();
+        trace_ext4_mark_inode_dirty(inode, _RET_IP_);
        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (ext4_handle_valid(handle) &&
            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1b..eb3bc2fe647 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
                return err;
        }
+        case FITRIM:
+        {
+                struct super_block *sb = inode->i_sb;
+                struct fstrim_range range;
+                int ret = 0;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&range, (struct fstrim_range *)arg,
+                    sizeof(range)))
+                        return -EFAULT;
+                ret = ext4_trim_fs(sb, &range);
+                if (ret < 0)
+                        return ret;
+                if (copy_to_user((struct fstrim_range *)arg, &range,
+                    sizeof(range)))
+                        return -EFAULT;
+                return 0;
+        }
        default:
                return -ENOTTY;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724..851f49b2f9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
 static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-        int ret;
        ext4_fsblk_t discard_block;
        discard_block = block + ext4_group_first_block_no(sb, block_group);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
-        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == -EOPNOTSUPP) {
-                ext4_warning(sb, "discard not supported, disabling");
-                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-        }
-        return ret;
 }
 /*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
-        int err, count = 0, count2 = 0;
+        int err, ret, count = 0, count2 = 0;
        struct ext4_free_data *entry;
        struct list_head *l, *ltmp;
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
-                if (test_opt(sb, DISCARD))
+                if (test_opt(sb, DISCARD)) {
-                        ext4_issue_discard(sb, entry->group,
+                        ret = ext4_issue_discard(sb, entry->group,
                                        entry->start_blk, entry->count);
+                        if (unlikely(ret == -EOPNOTSUPP)) {
+                                ext4_warning(sb, "discard not supported, "
+                                                 "disabling");
+                                clear_opt(sb, DISCARD);
+                        }
+                }
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -3881,19 +3881,6 @@ repeat:
        }
 }
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                        struct ext4_buddy *e4b,
-                                        sector_t block, int count)
-{
-        BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
         * EDQUOT check, as blocks and quotas have been already
         * reserved when data being copied into pagecache.
         */
-        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
        else {
                /* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, inquota - ar->len);
        if (!ar->len) {
-                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                if (!ext4_test_inode_state(ar->inode,
+                                           EXT4_STATE_DELALLOC_RESERVED))
                        /* release all the reserved blocks if non delalloc */
                        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
                                                reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
                 * blocks being freed are metadata. these blocks shouldn't
                 * be used until this transaction is committed
                 */
-                new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                if (!new_entry) {
+                        err = -ENOMEM;
+                        goto error_return;
+                }
                new_entry->start_blk = bit;
                new_entry->group  = block_group;
                new_entry->count = count;
@@ -4640,12 +4632,9 @@ do_more:
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
-                if (test_opt(sb, DISCARD))
-                        ext4_issue_discard(sb, block_group, bit, count);
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
-                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
        }
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4720,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
        ext4_unlock_group(sb, group);
        ret = ext4_issue_discard(sb, group, start, count);
-        if (ret)
-                ext4_std_error(sb, ret);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4821,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
        ext4_grpblk_t cnt = 0, first_block, last_block;
        uint64_t start, len, minlen, trimmed;
+        ext4_fsblk_t first_data_blk =
+                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        int ret = 0;
        start = range->start >> sb->s_blocksize_bits;
@@ -4830,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
                return -EINVAL;
+        if (start < first_data_blk) {
+                len -= first_data_blk - start;
+                start = first_data_blk;
+        }
        /* Determine first and last group to examine based on start and len */
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4853,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                if (len >= EXT4_BLOCKS_PER_GROUP(sb))
                        len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
                else
-                        last_block = len;
+                        last_block = first_block + len;
                if (e4b.bd_info->bb_free >= minlen) {
                        cnt = ext4_trim_all_free(sb, &e4b, first_block,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b72..b0a126f23c2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
        goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
                EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
        tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-                                   S_IFREG, 0, goal);
+                                   S_IFREG, NULL, goal);
        if (IS_ERR(tmp_inode)) {
                retval = -ENOMEM;
                ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099..5485390d32c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-                if (!ext4_check_dir_entry(dir, de, bh,
+                if (ext4_check_dir_entry(dir, NULL, de, bh,
-                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+                                (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-                                                +((char *)de - bh->b_data))) {
+                                         + ((char *)de - bh->b_data))) {
                        /* On error, skip the f_pos to the next block. */
                        dir_file->f_pos = (dir_file->f_pos |
                                        (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
                if ((char *) de + namelen <= dlimit &&
                    ext4_match (namelen, name, de)) {
                        /* found a match - just to be sure, do a full check */
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                return -1;
                        *res_dir = de;
                        return 1;
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        if (namelen > EXT4_NAME_LEN)
                return NULL;
        if ((namelen <= 2) && (name[0] == '.') &&
-            (name[1] == '.' || name[1] == '0')) {
+            (name[1] == '.' || name[1] == '\0')) {
                /*
                 * "." or ".." will only be in the first block
                 * NFS may look up ".."; "." should be handled by the VFS
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
-                if (unlikely(IS_ERR(inode))) {
+                if (IS_ERR(inode)) {
                        if (PTR_ERR(inode) == -ESTALE) {
                                EXT4_ERROR_INODE(dir,
                                                 "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                de = (struct ext4_dir_entry_2 *)bh->b_data;
                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                return -EIO;
                        if (ext4_match(namelen, name, de))
                                return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-                ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                if (err) {
+                        ext4_std_error(inode->i_sb, err);
+                        goto cleanup;
+                }
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
 {
        struct ext4_dir_entry_2 *de, *pde;
        unsigned int blocksize = dir->i_sb->s_blocksize;
-        int i;
+        int i, err;
        i = 0;
        pde = NULL;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        while (i < bh->b_size) {
-                if (!ext4_check_dir_entry(dir, de, bh, i))
+                if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
                        BUFFER_TRACE(bh, "get_write_access");
-                        ext4_journal_get_write_access(handle, bh);
+                        err = ext4_journal_get_write_access(handle, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        if (pde)
                                pde->rec_len = ext4_rec_len_to_disk(
                                        ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
                                de->inode = 0;
                        dir->i_version++;
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, dir, bh);
+                        err = ext4_handle_dirty_metadata(handle, dir, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                        return 0;
                }
                i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
        handle_t *handle;
        struct inode *inode;
-        struct buffer_head *dir_block;
+        struct buffer_head *dir_block = NULL;
        struct ext4_dir_entry_2 *de;
        unsigned int blocksize = dir->i_sb->s_blocksize;
        int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
        if (!dir_block)
                goto out_clear_inode;
        BUFFER_TRACE(dir_block, "get_write_access");
-        ext4_journal_get_write_access(handle, dir_block);
+        err = ext4_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-        ext4_handle_dirty_metadata(handle, dir, dir_block);
+        err = ext4_handle_dirty_metadata(handle, dir, dir_block);
-        brelse(dir_block);
+        if (err)
-        ext4_mark_inode_dirty(handle, inode);
+                goto out_clear_inode;
-        err = ext4_add_entry(handle, dentry, inode);
+        err = ext4_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext4_add_entry(handle, dentry, inode);
        if (err) {
 out_clear_inode:
                clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
        }
        ext4_inc_count(handle, dir);
        ext4_update_dx_flag(dir);
-        ext4_mark_inode_dirty(handle, dir);
+        err = ext4_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
                        }
                        de = (struct ext4_dir_entry_2 *) bh->b_data;
                }
-                if (!ext4_check_dir_entry(inode, de, bh, offset)) {
+                if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
                        de = (struct ext4_dir_entry_2 *)(bh->b_data +
                                                         sb->s_blocksize);
                        offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                                        ext4_current_time(new_dir);
                ext4_mark_inode_dirty(handle, new_dir);
                BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-                ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+                retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+                if (unlikely(retval)) {
+                        ext4_std_error(new_dir->i_sb, retval);
+                        goto end_rename;
+                }
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
                                                cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+                retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+                if (retval) {
+                        ext4_std_error(old_dir->i_sb, retval);
+                        goto end_rename;
+                }
                ext4_dec_count(handle, old_dir);
                if (new_inode) {
                        /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d97..7270dcfca92 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,16 +32,24 @@
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
+#define WQ_HASH_SZ              37
+#define to_ioend_wq(v)  (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
 int __init ext4_init_pageio(void)
 {
+        int i;
        io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
        if (io_page_cachep == NULL)
                return -ENOMEM;
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
-        if (io_page_cachep == NULL) {
+        if (io_end_cachep == NULL) {
                kmem_cache_destroy(io_page_cachep);
                return -ENOMEM;
        }
+        for (i = 0; i < WQ_HASH_SZ; i++)
+                init_waitqueue_head(&ioend_wq[i]);
        return 0;
 }
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
        kmem_cache_destroy(io_page_cachep);
 }
+void ext4_ioend_wait(struct inode *inode)
+{
+        wait_queue_head_t *wq = to_ioend_wq(inode);
+        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+static void put_io_page(struct ext4_io_page *io_page)
+{
+        if (atomic_dec_and_test(&io_page->p_count)) {
+                end_page_writeback(io_page->p_page);
+                put_page(io_page->p_page);
+                kmem_cache_free(io_page_cachep, io_page);
+        }
+}
 void ext4_free_io_end(ext4_io_end_t *io)
 {
        int i;
+        wait_queue_head_t *wq;
        BUG_ON(!io);
        if (io->page)
                put_page(io->page);
-        for (i = 0; i < io->num_io_pages; i++) {
+        for (i = 0; i < io->num_io_pages; i++)
-                if (--io->pages[i]->p_count == 0) {
+                put_io_page(io->pages[i]);
-                        struct page *page = io->pages[i]->p_page;
-                        end_page_writeback(page);
-                        put_page(page);
-                        kmem_cache_free(io_page_cachep, io->pages[i]);
-                }
-        }
        io->num_io_pages = 0;
-        iput(io->inode);
+        wq = to_ioend_wq(io->inode);
+        if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+            waitqueue_active(wq))
+                wake_up_all(wq);
        kmem_cache_free(io_end_cachep, io);
 }
@@ -137,13 +158,10 @@ static void ext4_end_io_work(struct work_struct *work)
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 {
-        ext4_io_end_t *io = NULL;
+        ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
-        io = kmem_cache_alloc(io_end_cachep, flags);
        if (io) {
-                memset(io, 0, sizeof(*io));
+                atomic_inc(&EXT4_I(inode)->i_ioend_count);
-                io->inode = igrab(inode);
+                io->inode = inode;
-                BUG_ON(!io->inode);
                INIT_WORK(&io->work, ext4_end_io_work);
                INIT_LIST_HEAD(&io->list);
        }
@@ -171,35 +189,15 @@ static void ext4_end_bio(struct bio *bio, int error)
        struct workqueue_struct *wq;
        struct inode *inode;
        unsigned long flags;
-        ext4_fsblk_t err_block;
        int i;
        BUG_ON(!io_end);
-        inode = io_end->inode;
        bio->bi_private = NULL;
        bio->bi_end_io = NULL;
        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = 0;
-        err_block = bio->bi_sector >> (inode->i_blkbits - 9);
        bio_put(bio);
-        if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-                pr_err("sb umounted, discard end_io request for inode %lu\n",
-                        io_end->inode->i_ino);
-                ext4_free_io_end(io_end);
-                return;
-        }
-        if (error) {
-                io_end->flag |= EXT4_IO_END_ERROR;
-                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-                             "(offset %llu size %ld starting block %llu)",
-                             inode->i_ino,
-                             (unsigned long long) io_end->offset,
-                             (long) io_end->size,
-                             (unsigned long long) err_block);
-        }
        for (i = 0; i < io_end->num_io_pages; i++) {
                struct page *page = io_end->pages[i]->p_page;
                struct buffer_head *bh, *head;
@@ -236,14 +234,6 @@ static void ext4_end_bio(struct bio *bio, int error)
                        } while (bh != head);
                }
-                if (--io_end->pages[i]->p_count == 0) {
-                        struct page *page = io_end->pages[i]->p_page;
-                        end_page_writeback(page);
-                        put_page(page);
-                        kmem_cache_free(io_page_cachep, io_end->pages[i]);
-                }
                /*
                 * If this is a partial write which happened to make
                 * all buffers uptodate then we can optimize away a
@@ -253,9 +243,22 @@ static void ext4_end_bio(struct bio *bio, int error)
                 */
                if (!partial_write)
                        SetPageUptodate(page);
-        }
+                put_io_page(io_end->pages[i]);
+        }
        io_end->num_io_pages = 0;
+        inode = io_end->inode;
+        if (error) {
+                io_end->flag |= EXT4_IO_END_ERROR;
+                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                             "(offset %llu size %ld starting block %llu)",
+                             inode->i_ino,
+                             (unsigned long long) io_end->offset,
+                             (long) io_end->size,
+                             (unsigned long long)
+                             bio->bi_sector >> (inode->i_blkbits - 9));
+        }
        /* Add the io_end to per-inode completed io list*/
        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +308,6 @@ static int io_submit_init(struct ext4_io_submit *io,
        bio->bi_private = io->io_end = io_end;
        bio->bi_end_io = ext4_end_bio;
-        io_end->inode = inode;
        io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
        io->io_bio = bio;
@@ -360,7 +362,7 @@ submit_and_retry:
        if ((io_end->num_io_pages == 0) ||
            (io_end->pages[io_end->num_io_pages-1] != io_page)) {
                io_end->pages[io_end->num_io_pages++] = io_page;
-                io_page->p_count++;
+                atomic_inc(&io_page->p_count);
        }
        return 0;
 }
@@ -389,7 +391,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                return -ENOMEM;
        }
        io_page->p_page = page;
-        io_page->p_count = 0;
+        atomic_set(&io_page->p_count, 1);
        get_page(page);
        for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +423,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
         * PageWriteback bit from the page to prevent the system from
         * wedging later on.
         */
-        if (io_page->p_count == 0) {
+        put_io_page(io_page);
-                put_page(page);
-                end_page_writeback(page);
-                kmem_cache_free(io_page_cachep, io_page);
-        }
        return ret;
 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..3ecc6e45d2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-                ext4_handle_dirty_metadata(handle, NULL, gdb);
+                err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+                if (unlikely(err)) {
+                        brelse(gdb);
+                        goto exit_bh;
+                }
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -232,6 +236,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                               GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
@@ -247,13 +253,20 @@ static int setup_new_group_blocks(struct super_block *sb,
        err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = input->inode_table - start;
+             i < sbi->s_itb_per_group; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
        ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
                             bh->b_data);
-        ext4_handle_dirty_metadata(handle, NULL, bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bh);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_bh;
+        }
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
        ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -265,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                             bh->b_data);
-        ext4_handle_dirty_metadata(handle, NULL, bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bh);
+        if (unlikely(err))
+                ext4_std_error(sb, err);
 exit_bh:
        brelse(bh);
@@ -417,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
                goto exit_dind;
        }
-        if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+        err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+        if (unlikely(err))
                goto exit_dind;
-        if ((err = ext4_journal_get_write_access(handle, *primary)))
+        err = ext4_journal_get_write_access(handle, *primary);
+        if (unlikely(err))
                goto exit_sbh;
-        if ((err = ext4_journal_get_write_access(handle, dind)))
+        err = ext4_journal_get_write_access(handle, dind);
-                goto exit_primary;
+        if (unlikely(err))
+                ext4_std_error(sb, err);
        /* ext4_reserve_inode_write() gets a reference on the iloc */
-        if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+        err = ext4_reserve_inode_write(handle, inode, &iloc);
+        if (unlikely(err))
                goto exit_dindj;
        n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -449,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-        ext4_handle_dirty_metadata(handle, NULL, dind);
+        err = ext4_handle_dirty_metadata(handle, NULL, dind);
-        brelse(dind);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_inode;
+        }
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
        ext4_mark_iloc_dirty(handle, inode, &iloc);
        memset((*primary)->b_data, 0, sb->s_blocksize);
-        ext4_handle_dirty_metadata(handle, NULL, *primary);
+        err = ext4_handle_dirty_metadata(handle, NULL, *primary);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_inode;
+        }
+        brelse(dind);
        o_group_desc = EXT4_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -465,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+        err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+        if (err)
+                ext4_std_error(sb, err);
-        return 0;
+        return err;
 exit_inode:
        /* ext4_journal_release_buffer(handle, iloc.bh); */
        brelse(iloc.bh);
 exit_dindj:
        /* ext4_journal_release_buffer(handle, dind); */
-exit_primary:
-        /* ext4_journal_release_buffer(handle, *primary); */
 exit_sbh:
-        /* ext4_journal_release_buffer(handle, *primary); */
+        /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
 exit_dind:
        brelse(dind);
 exit_bh:
@@ -660,7 +687,9 @@ static void update_backups(struct super_block *sb,
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                ext4_handle_dirty_metadata(handle, NULL, bh);
+                err = ext4_handle_dirty_metadata(handle, NULL, bh);
+                if (unlikely(err))
+                        ext4_std_error(sb, err);
                brelse(bh);
        }
        if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -878,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        /* Update the global fs size fields */
        sbi->s_groups_count++;
-        ext4_handle_dirty_metadata(handle, NULL, primary);
+        err = ext4_handle_dirty_metadata(handle, NULL, primary);
+        if (unlikely(err)) {
+                ext4_std_error(sb, err);
+                goto exit_journal;
+        }
        /* Update the reserved block counts only once the new group is
         * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af..48ce561fafa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
 void __ext4_error(struct super_block *sb, const char *function,
                  unsigned int line, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
+        vaf.fmt = fmt;
-               sb->s_id, function, line, current->comm);
+        vaf.va = &args;
-        vprintk(fmt, args);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
-        printk("\n");
+               sb->s_id, function, line, current->comm, &vaf);
        va_end(args);
        ext4_handle_error(sb);
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
                      const char *fmt, ...)
 {
        va_list args;
+        struct va_format vaf;
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
        es->s_last_error_block = cpu_to_le64(block);
        save_error_info(inode->i_sb, function, line);
        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
               inode->i_sb->s_id, function, line, inode->i_ino);
        if (block)
-                printk("block %llu: ", block);
+                printk(KERN_CONT "block %llu: ", block);
-        printk("comm %s: ", current->comm);
+        printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
-        vprintk(fmt, args);
-        printk("\n");
        va_end(args);
        ext4_handle_error(inode->i_sb);
 }
 void ext4_error_file(struct file *file, const char *function,
-                     unsigned int line, const char *fmt, ...)
+                     unsigned int line, ext4_fsblk_t block,
+                     const char *fmt, ...)
 {
        va_list args;
+        struct va_format vaf;
        struct ext4_super_block *es;
        struct inode *inode = file->f_dentry->d_inode;
        char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
        es = EXT4_SB(inode->i_sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
        save_error_info(inode->i_sb, function, line);
-        va_start(args, fmt);
        path = d_path(&(file->f_path), pathname, sizeof(pathname));
-        if (!path)
+        if (IS_ERR(path))
                path = "(unknown)";
        printk(KERN_CRIT
-               "EXT4-fs error (device %s): %s:%d: inode #%lu "
+               "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
-               "(comm %s path %s): ",
+               inode->i_sb->s_id, function, line, inode->i_ino);
-               inode->i_sb->s_id, function, line, inode->i_ino,
+        if (block)
-               current->comm, path);
+                printk(KERN_CONT "block %llu: ", block);
-        vprintk(fmt, args);
+        va_start(args, fmt);
-        printk("\n");
+        vaf.fmt = fmt;
+        vaf.va = &args;
+        printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
        va_end(args);
        ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
                panic("EXT4-fs panic from previous error\n");
 }
-void ext4_msg (struct super_block * sb, const char *prefix,
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-                   const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        va_end(args);
 }
 void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
+        vaf.fmt = fmt;
-               sb->s_id, function, line);
+        vaf.va = &args;
-        vprintk(fmt, args);
+        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
-        printk("\n");
+               sb->s_id, function, line, &vaf);
        va_end(args);
 }
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
 __releases(bitlock)
 __acquires(bitlock)
 {
+        struct va_format vaf;
        va_list args;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(ino);
        es->s_last_error_block = cpu_to_le64(block);
        __save_error_info(sb, function, line);
        va_start(args, fmt);
+        vaf.fmt = fmt;
+        vaf.va = &args;
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
               sb->s_id, function, line, grp);
        if (ino)
-                printk("inode %lu: ", ino);
+                printk(KERN_CONT "inode %lu: ", ino);
        if (block)
-                printk("block %llu:", (unsigned long long) block);
+                printk(KERN_CONT "block %llu:", (unsigned long long) block);
-        vprintk(fmt, args);
+        printk(KERN_CONT "%pV\n", &vaf);
-        printk("\n");
        va_end(args);
        if (test_opt(sb, ERRORS_CONT)) {
@@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
-        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
        if (IS_ERR(bdev))
                goto fail;
        return bdev;
@@ -663,8 +673,7 @@ fail:
 */
 static int ext4_blkdev_put(struct block_device *bdev)
 {
-        bd_release(bdev);
+        return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-        return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -808,32 +817,43 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
-        /*
-         * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
-         * therefore it can be null here.  Don't check it, just initialize
-         * jinode.
-         */
-        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
        ei->i_reserved_data_blocks = 0;
        ei->i_reserved_meta_blocks = 0;
        ei->i_allocated_meta_blocks = 0;
        ei->i_da_metadata_calc_len = 0;
-        ei->i_delalloc_reserved_flag = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
 #endif
+        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_completed_io_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->cur_aio_dio = NULL;
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
+        atomic_set(&ei->i_ioend_count, 0);
        return &ei->vfs_inode;
 }
+static int ext4_drop_inode(struct inode *inode)
+{
+        int drop = generic_drop_inode(inode);
+        trace_ext4_drop_inode(inode, drop);
+        return drop;
+}
+static void ext4_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
 static void ext4_destroy_inode(struct inode *inode)
 {
+        ext4_ioend_wait(inode);
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
@@ -843,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode)
                                true);
                dump_stack();
        }
-        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+        call_rcu(&inode->i_rcu, ext4_i_callback);
 }
 static void init_once(void *foo)
@@ -881,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
        end_writeback(inode);
        dquot_drop(inode);
        ext4_discard_preallocations(inode);
-        if (EXT4_JOURNAL(inode))
+        if (EXT4_I(inode)->jinode) {
-                jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
-                                       &EXT4_I(inode)->jinode);
+                                               EXT4_I(inode)->jinode);
+                jbd2_free_inode(EXT4_I(inode)->jinode);
+                EXT4_I(inode)->jinode = NULL;
+        }
 }
 static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -1016,6 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_NODELALLOC))
                seq_puts(seq, ",nodelalloc");
+        if (test_opt(sb, MBLK_IO_SUBMIT))
+                seq_puts(seq, ",mblk_io_submit");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
        /*
@@ -1136,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                                char *path);
+                         struct path *path);
 static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1173,6 +1198,7 @@ static const struct super_operations ext4_sops = {
        .destroy_inode  = ext4_destroy_inode,
        .write_inode    = ext4_write_inode,
        .dirty_inode    = ext4_dirty_inode,
+        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,
        .put_super      = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
@@ -1186,7 +1212,6 @@ static const struct super_operations ext4_sops = {
        .quota_write    = ext4_quota_write,
 #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
-        .trim_fs        = ext4_trim_fs
 };
 static const struct super_operations ext4_nojournal_sops = {
@@ -1194,6 +1219,7 @@ static const struct super_operations ext4_nojournal_sops = {
        .destroy_inode  = ext4_destroy_inode,
        .write_inode    = ext4_write_inode,
        .dirty_inode    = ext4_dirty_inode,
+        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,
        .write_super    = ext4_write_super,
        .put_super      = ext4_put_super,
@@ -1228,8 +1254,8 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
        Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
-        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
-        Opt_block_validity, Opt_noblock_validity,
+        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
@@ -1293,6 +1319,8 @@ static const match_table_t tokens = {
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
+        {Opt_mblk_io_submit, "mblk_io_submit"},
+        {Opt_nomblk_io_submit, "nomblk_io_submit"},
        {Opt_block_validity, "block_validity"},
        {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1371,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
                sbi->s_qf_names[qtype] = NULL;
                return 0;
        }
-        set_opt(sbi->s_mount_opt, QUOTA);
+        set_opt(sb, QUOTA);
        return 1;
 }
@@ -1426,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
                switch (token) {
                case Opt_bsd_df:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        clear_opt(sbi->s_mount_opt, MINIX_DF);
+                        clear_opt(sb, MINIX_DF);
                        break;
                case Opt_minix_df:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        set_opt(sbi->s_mount_opt, MINIX_DF);
+                        set_opt(sb, MINIX_DF);
                        break;
                case Opt_grpid:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        set_opt(sbi->s_mount_opt, GRPID);
+                        set_opt(sb, GRPID);
                        break;
                case Opt_nogrpid:
                        ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-                        clear_opt(sbi->s_mount_opt, GRPID);
+                        clear_opt(sb, GRPID);
                        break;
                case Opt_resuid:
@@ -1458,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
                        /* *sb_block = match_int(&args[0]); */
                        break;
                case Opt_err_panic:
-                        clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        clear_opt(sb, ERRORS_CONT);
-                        clear_opt(sbi->s_mount_opt, ERRORS_RO);
+                        clear_opt(sb, ERRORS_RO);
-                        set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        set_opt(sb, ERRORS_PANIC);
                        break;
                case Opt_err_ro:
-                        clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        clear_opt(sb, ERRORS_CONT);
-                        clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        clear_opt(sb, ERRORS_PANIC);
-                        set_opt(sbi->s_mount_opt, ERRORS_RO);
+                        set_opt(sb, ERRORS_RO);
                        break;
                case Opt_err_cont:
-                        clear_opt(sbi->s_mount_opt, ERRORS_RO);
+                        clear_opt(sb, ERRORS_RO);
-                        clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                        clear_opt(sb, ERRORS_PANIC);
-                        set_opt(sbi->s_mount_opt, ERRORS_CONT);
+                        set_opt(sb, ERRORS_CONT);
                        break;
                case Opt_nouid32:
-                        set_opt(sbi->s_mount_opt, NO_UID32);
+                        set_opt(sb, NO_UID32);
                        break;
                case Opt_debug:
-                        set_opt(sbi->s_mount_opt, DEBUG);
+                        set_opt(sb, DEBUG);
                        break;
                case Opt_oldalloc:
-                        set_opt(sbi->s_mount_opt, OLDALLOC);
+                        set_opt(sb, OLDALLOC);
                        break;
                case Opt_orlov:
-                        clear_opt(sbi->s_mount_opt, OLDALLOC);
+                        clear_opt(sb, OLDALLOC);
                        break;
 #ifdef CONFIG_EXT4_FS_XATTR
                case Opt_user_xattr:
-                        set_opt(sbi->s_mount_opt, XATTR_USER);
+                        set_opt(sb, XATTR_USER);
                        break;
                case Opt_nouser_xattr:
-                        clear_opt(sbi->s_mount_opt, XATTR_USER);
+                        clear_opt(sb, XATTR_USER);
                        break;
 #else
                case Opt_user_xattr:
@@ -1499,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
                case Opt_acl:
-                        set_opt(sbi->s_mount_opt, POSIX_ACL);
+                        set_opt(sb, POSIX_ACL);
                        break;
                case Opt_noacl:
-                        clear_opt(sbi->s_mount_opt, POSIX_ACL);
+                        clear_opt(sb, POSIX_ACL);
                        break;
 #else
                case Opt_acl:
@@ -1521,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
                                         "Cannot specify journal on remount");
                                return 0;
                        }
-                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
+                        set_opt(sb, UPDATE_JOURNAL);
                        break;
                case Opt_journal_dev:
                        if (is_remount) {
@@ -1534,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
                        *journal_devnum = option;
                        break;
                case Opt_journal_checksum:
-                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        set_opt(sb, JOURNAL_CHECKSUM);
                        break;
                case Opt_journal_async_commit:
-                        set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+                        set_opt(sb, JOURNAL_ASYNC_COMMIT);
-                        set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+                        set_opt(sb, JOURNAL_CHECKSUM);
                        break;
                case Opt_noload:
-                        set_opt(sbi->s_mount_opt, NOLOAD);
+                        set_opt(sb, NOLOAD);
                        break;
                case Opt_commit:
                        if (match_int(&args[0], &option))
@@ -1584,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
                                        return 0;
                                }
                        } else {
-                                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+                                clear_opt(sb, DATA_FLAGS);
                                sbi->s_mount_opt |= data_opt;
                        }
                        break;
                case Opt_data_err_abort:
-                        set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                        set_opt(sb, DATA_ERR_ABORT);
                        break;
                case Opt_data_err_ignore:
-                        clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                        clear_opt(sb, DATA_ERR_ABORT);
                        break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
@@ -1632,12 +1660,12 @@ set_qf_format:
                        break;
                case Opt_quota:
                case Opt_usrquota:
-                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sb, QUOTA);
-                        set_opt(sbi->s_mount_opt, USRQUOTA);
+                        set_opt(sb, USRQUOTA);
                        break;
                case Opt_grpquota:
-                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sb, QUOTA);
-                        set_opt(sbi->s_mount_opt, GRPQUOTA);
+                        set_opt(sb, GRPQUOTA);
                        break;
                case Opt_noquota:
                        if (sb_any_quota_loaded(sb)) {
@@ -1645,9 +1673,9 @@ set_qf_format:
                                        "options when quota turned on");
                                return 0;
                        }
-                        clear_opt(sbi->s_mount_opt, QUOTA);
+                        clear_opt(sb, QUOTA);
-                        clear_opt(sbi->s_mount_opt, USRQUOTA);
+                        clear_opt(sb, USRQUOTA);
-                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
+                        clear_opt(sb, GRPQUOTA);
                        break;
 #else
                case Opt_quota:
@@ -1673,7 +1701,7 @@ set_qf_format:
                        sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
                        break;
                case Opt_nobarrier:
-                        clear_opt(sbi->s_mount_opt, BARRIER);
+                        clear_opt(sb, BARRIER);
                        break;
                case Opt_barrier:
                        if (args[0].from) {
@@ -1682,9 +1710,9 @@ set_qf_format:
                        } else
                                option = 1;     /* No argument, default to 1 */
                        if (option)
-                                set_opt(sbi->s_mount_opt, BARRIER);
+                                set_opt(sb, BARRIER);
                        else
-                                clear_opt(sbi->s_mount_opt, BARRIER);
+                                clear_opt(sb, BARRIER);
                        break;
                case Opt_ignore:
                        break;
@@ -1708,11 +1736,17 @@ set_qf_format:
                                 "Ignoring deprecated bh option");
                        break;
                case Opt_i_version:
-                        set_opt(sbi->s_mount_opt, I_VERSION);
+                        set_opt(sb, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
                        break;
                case Opt_nodelalloc:
-                        clear_opt(sbi->s_mount_opt, DELALLOC);
+                        clear_opt(sb, DELALLOC);
+                        break;
+                case Opt_mblk_io_submit:
+                        set_opt(sb, MBLK_IO_SUBMIT);
+                        break;
+                case Opt_nomblk_io_submit:
+                        clear_opt(sb, MBLK_IO_SUBMIT);
                        break;
                case Opt_stripe:
                        if (match_int(&args[0], &option))
@@ -1722,13 +1756,13 @@ set_qf_format:
                        sbi->s_stripe = option;
                        break;
                case Opt_delalloc:
-                        set_opt(sbi->s_mount_opt, DELALLOC);
+                        set_opt(sb, DELALLOC);
                        break;
                case Opt_block_validity:
-                        set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        set_opt(sb, BLOCK_VALIDITY);
                        break;
                case Opt_noblock_validity:
-                        clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        clear_opt(sb, BLOCK_VALIDITY);
                        break;
                case Opt_inode_readahead_blks:
                        if (match_int(&args[0], &option))
@@ -1752,7 +1786,7 @@ set_qf_format:
                                                            option);
                        break;
                case Opt_noauto_da_alloc:
-                        set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                        set_opt(sb, NO_AUTO_DA_ALLOC);
                        break;
                case Opt_auto_da_alloc:
                        if (args[0].from) {
@@ -1761,24 +1795,24 @@ set_qf_format:
                        } else
                                option = 1;     /* No argument, default to 1 */
                        if (option)
-                                clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                                clear_opt(sb, NO_AUTO_DA_ALLOC);
                        else
-                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                                set_opt(sb,NO_AUTO_DA_ALLOC);
                        break;
                case Opt_discard:
-                        set_opt(sbi->s_mount_opt, DISCARD);
+                        set_opt(sb, DISCARD);
                        break;
                case Opt_nodiscard:
-                        clear_opt(sbi->s_mount_opt, DISCARD);
+                        clear_opt(sb, DISCARD);
                        break;
                case Opt_dioread_nolock:
-                        set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        set_opt(sb, DIOREAD_NOLOCK);
                        break;
                case Opt_dioread_lock:
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                        break;
                case Opt_init_inode_table:
-                        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        set_opt(sb, INIT_INODE_TABLE);
                        if (args[0].from) {
                                if (match_int(&args[0], &option))
                                        return 0;
@@ -1789,7 +1823,7 @@ set_qf_format:
                        sbi->s_li_wait_mult = option;
                        break;
                case Opt_noinit_inode_table:
-                        clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        clear_opt(sb, INIT_INODE_TABLE);
                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
@@ -1801,10 +1835,10 @@ set_qf_format:
 #ifdef CONFIG_QUOTA
        if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
                if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
-                        clear_opt(sbi->s_mount_opt, USRQUOTA);
+                        clear_opt(sb, USRQUOTA);
                if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
-                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
+                        clear_opt(sb, GRPQUOTA);
                if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
                        ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1874,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        ext4_commit_super(sb, 1);
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-                                "bpg=%lu, ipg=%lu, mo=%04x]\n",
+                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
                        EXT4_INODES_PER_GROUP(sb),
-                        sbi->s_mount_opt);
+                        sbi->s_mount_opt, sbi->s_mount_opt2);
        return res;
 }
@@ -1909,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
        size = flex_group_count * sizeof(struct flex_groups);
        sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
        if (sbi->s_flex_groups == NULL) {
-                sbi->s_flex_groups = vmalloc(size);
+                sbi->s_flex_groups = vzalloc(size);
-                if (sbi->s_flex_groups)
+                if (sbi->s_flex_groups == NULL) {
-                        memset(sbi->s_flex_groups, 0, size);
+                        ext4_msg(sb, KERN_ERR,
-        }
+                                 "not enough memory for %u flex groups",
-        if (sbi->s_flex_groups == NULL) {
+                                 flex_group_count);
-                ext4_msg(sb, KERN_ERR, "not enough memory for "
+                        goto failed;
-                                "%u flex groups", flex_group_count);
+                }
-                goto failed;
        }
        for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2699,7 +2732,6 @@ static int ext4_lazyinit_thread(void *arg)
        struct ext4_li_request *elr;
        unsigned long next_wakeup;
        DEFINE_WAIT(wait);
-        int ret;
        BUG_ON(NULL == eli);
@@ -2723,13 +2755,12 @@ cont_thread:
                        elr = list_entry(pos, struct ext4_li_request,
                                         lr_request);
-                        if (time_after_eq(jiffies, elr->lr_next_sched))
+                        if (time_after_eq(jiffies, elr->lr_next_sched)) {
-                                ret = ext4_run_li_request(elr);
+                                if (ext4_run_li_request(elr) != 0) {
+                                        /* error, remove the lazy_init job */
-                        if (ret) {
+                                        ext4_remove_li_request(elr);
-                                ret = 0;
+                                        continue;
-                                ext4_remove_li_request(elr);
+                                }
-                                continue;
                        }
                        if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2771,8 @@ cont_thread:
                if (freezing(current))
                        refrigerator();
-                if (time_after_eq(jiffies, next_wakeup)) {
+                if ((time_after_eq(jiffies, next_wakeup)) ||
+                    (MAX_JIFFY_OFFSET == next_wakeup)) {
                        cond_resched();
                        continue;
                }
@@ -2788,9 +2820,6 @@ static void ext4_clear_request_list(void)
        struct ext4_li_request *elr;
        mutex_lock(&ext4_li_info->li_list_mtx);
-        if (list_empty(&ext4_li_info->li_request_list))
-                return;
        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
                elr = list_entry(pos, struct ext4_li_request,
                                 lr_request);
@@ -2899,7 +2928,7 @@ static int ext4_register_li_request(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_li_request *elr;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
-        int ret;
+        int ret = 0;
        if (sbi->s_li_request != NULL)
                return 0;
@@ -3054,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+        set_opt(sb, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
-                set_opt(sbi->s_mount_opt, DEBUG);
+                set_opt(sb, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
                ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
                        "2.6.38");
-                set_opt(sbi->s_mount_opt, GRPID);
+                set_opt(sb, GRPID);
        }
        if (def_mount_opts & EXT4_DEFM_UID16)
-                set_opt(sbi->s_mount_opt, NO_UID32);
+                set_opt(sb, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
        if (def_mount_opts & EXT4_DEFM_XATTR_USER)
-                set_opt(sbi->s_mount_opt, XATTR_USER);
+                set_opt(sb, XATTR_USER);
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
        if (def_mount_opts & EXT4_DEFM_ACL)
-                set_opt(sbi->s_mount_opt, POSIX_ACL);
+                set_opt(sb, POSIX_ACL);
 #endif
        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-                set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+                set_opt(sb, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-                set_opt(sbi->s_mount_opt, ORDERED_DATA);
+                set_opt(sb, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+                set_opt(sb, WRITEBACK_DATA);
        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-                set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+                set_opt(sb, ERRORS_PANIC);
        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
-                set_opt(sbi->s_mount_opt, ERRORS_CONT);
+                set_opt(sb, ERRORS_CONT);
        else
-                set_opt(sbi->s_mount_opt, ERRORS_RO);
+                set_opt(sb, ERRORS_RO);
        if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
-                set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                set_opt(sb, BLOCK_VALIDITY);
        if (def_mount_opts & EXT4_DEFM_DISCARD)
-                set_opt(sbi->s_mount_opt, DISCARD);
+                set_opt(sb, DISCARD);
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3097,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
-                set_opt(sbi->s_mount_opt, BARRIER);
+                set_opt(sb, BARRIER);
        /*
         * enable delayed allocation by default
@@ -3105,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        if (!IS_EXT3_SB(sb) &&
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
-                set_opt(sbi->s_mount_opt, DELALLOC);
+                set_opt(sb, DELALLOC);
        if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
                           &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3257,13 +3286,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
-        ret = generic_check_addressable(sb->s_blocksize_bits,
+        err = generic_check_addressable(sb->s_blocksize_bits,
                                        ext4_blocks_count(es));
-        if (ret) {
+        if (err) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                if (sizeof(sector_t) < 8)
                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+                ret = err;
                goto failed_mount;
        }
@@ -3348,6 +3378,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        err = percpu_counter_init(&sbi->s_freeblocks_counter,
+                        ext4_count_free_blocks(sb));
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_freeinodes_counter,
+                                ext4_count_free_inodes(sb));
+        }
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_dirs_counter,
+                                ext4_count_dirs(sb));
+        }
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+        }
+        if (err) {
+                ext4_msg(sb, KERN_ERR, "insufficient memory");
+                goto failed_mount3;
+        }
        sbi->s_stripe = ext4_get_stripe_size(sbi);
        sbi->s_max_writeback_mb_bump = 128;
@@ -3389,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       "suppressed and not mounted read-only");
                goto failed_mount_wq;
        } else {
-                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+                clear_opt(sb, DATA_FLAGS);
-                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+                set_opt(sb, WRITEBACK_DATA);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
@@ -3428,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 */
                if (jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
-                        set_opt(sbi->s_mount_opt, ORDERED_DATA);
+                        set_opt(sb, ORDERED_DATA);
                else
-                        set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+                        set_opt(sb, JOURNAL_DATA);
                break;
        case EXT4_MOUNT_ORDERED_DATA:
@@ -3446,22 +3494,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-no_journal:
+        /*
-        err = percpu_counter_init(&sbi->s_freeblocks_counter,
+         * The journal may have updated the bg summary counts, so we
-                                  ext4_count_free_blocks(sb));
+         * need to update the global counters.
-        if (!err)
+         */
-                err = percpu_counter_init(&sbi->s_freeinodes_counter,
+        percpu_counter_set(&sbi->s_freeblocks_counter,
-                                          ext4_count_free_inodes(sb));
+                           ext4_count_free_blocks(sb));
-        if (!err)
+        percpu_counter_set(&sbi->s_freeinodes_counter,
-                err = percpu_counter_init(&sbi->s_dirs_counter,
+                           ext4_count_free_inodes(sb));
-                                          ext4_count_dirs(sb));
+        percpu_counter_set(&sbi->s_dirs_counter,
-        if (!err)
+                           ext4_count_dirs(sb));
-                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+        percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
-        if (err) {
-                ext4_msg(sb, KERN_ERR, "insufficient memory");
-                goto failed_mount_wq;
-        }
+no_journal:
        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3523,18 +3568,18 @@ no_journal:
            (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
                ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
                         "requested data journaling mode");
-                clear_opt(sbi->s_mount_opt, DELALLOC);
+                clear_opt(sb, DELALLOC);
        }
        if (test_opt(sb, DIOREAD_NOLOCK)) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
                                "option - requested data journaling mode");
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                }
                if (sb->s_blocksize < PAGE_SIZE) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
                                "option - block size is too small");
-                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                        clear_opt(sb, DIOREAD_NOLOCK);
                }
        }
@@ -3611,10 +3656,6 @@ failed_mount_wq:
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
-        percpu_counter_destroy(&sbi->s_freeblocks_counter);
-        percpu_counter_destroy(&sbi->s_freeinodes_counter);
-        percpu_counter_destroy(&sbi->s_dirs_counter);
-        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
        if (sbi->s_flex_groups) {
                if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3663,10 @@ failed_mount3:
                else
                        kfree(sbi->s_flex_groups);
        }
+        percpu_counter_destroy(&sbi->s_freeblocks_counter);
+        percpu_counter_destroy(&sbi->s_freeinodes_counter);
+        percpu_counter_destroy(&sbi->s_dirs_counter);
+        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -3732,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        if (bdev == NULL)
                return NULL;
-        if (bd_claim(bdev, sb)) {
-                ext4_msg(sb, KERN_ERR,
-                        "failed to claim external journal device");
-                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-                return NULL;
-        }
        blocksize = sb->s_blocksize;
        hblock = bdev_logical_block_size(bdev);
        if (blocksize < hblock) {
@@ -3949,13 +3987,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-                ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+                                           &EXT4_SB(sb)->s_freeblocks_counter));
-                                        &EXT4_SB(sb)->s_freeblocks_counter));
+        es->s_free_inodes_count =
-        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+                cpu_to_le32(percpu_counter_sum_positive(
-                es->s_free_inodes_count =
+                                &EXT4_SB(sb)->s_freeinodes_counter));
-                        cpu_to_le32(percpu_counter_sum_positive(
-                                        &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
@@ -4135,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
        return 0;
 }
+/*
+ * Structure to save mount options for ext4_remount's benefit
+ */
+struct ext4_mount_options {
+        unsigned long s_mount_opt;
+        unsigned long s_mount_opt2;
+        uid_t s_resuid;
+        gid_t s_resgid;
+        unsigned long s_commit_interval;
+        u32 s_min_batch_time, s_max_batch_time;
+#ifdef CONFIG_QUOTA
+        int s_jquota_fmt;
+        char *s_qf_names[MAXQUOTAS];
+#endif
+};
 static int ext4_remount(struct super_block *sb, int *flags, char *data)
 {
        struct ext4_super_block *es;
@@ -4155,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        lock_super(sb);
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
+        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4308,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
+        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4504,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
 * Standard function to be called on quota_on
 */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                         char *name)
+                         struct path *path)
 {
        int err;
-        struct path path;
        if (!test_opt(sb, QUOTA))
                return -EINVAL;
-        err = kern_path(name, LOOKUP_FOLLOW, &path);
-        if (err)
-                return err;
        /* Quotafile not on the same filesystem? */
-        if (path.mnt->mnt_sb != sb) {
+        if (path->mnt->mnt_sb != sb)
-                path_put(&path);
                return -EXDEV;
-        }
        /* Journaling quota? */
        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Quotafile not in fs root? */
-                if (path.dentry->d_parent != sb->s_root)
+                if (path->dentry->d_parent != sb->s_root)
                        ext4_msg(sb, KERN_WARNING,
                                "Quota file not on filesystem root. "
                                "Journaled quota will not work");
@@ -4535,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
         * all updates to the file when we bypass pagecache...
         */
        if (EXT4_SB(sb)->s_journal &&
-            ext4_should_journal_data(path.dentry->d_inode)) {
+            ext4_should_journal_data(path->dentry->d_inode)) {
                /*
                 * We don't need to lock updates but journal_flush() could
                 * otherwise be livelocked...
@@ -4543,25 +4590,19 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
                err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-                if (err) {
+                if (err)
-                        path_put(&path);
                        return err;
-                }
        }
-        err = dquot_quota_on_path(sb, type, format_id, &path);
+        return dquot_quota_on(sb, type, format_id, path);
-        path_put(&path);
-        return err;
 }
 static int ext4_quota_off(struct super_block *sb, int type)
 {
-        /* Force all delayed allocation blocks to be allocated */
+        /* Force all delayed allocation blocks to be allocated.
-        if (test_opt(sb, DELALLOC)) {
+         * Caller already holds s_umount sem */
-                down_read(&sb->s_umount);
+        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);
-                up_read(&sb->s_umount);
-        }
        return dquot_quota_off(sb, type);
 }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b..fc32176eee3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
 static int
 ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-        int i_error, b_error;
+        int ret, ret2;
        down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
-        i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+        ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
-        if (i_error < 0) {
+        if (ret < 0)
-                b_error = 0;
+                goto errout;
-        } else {
+        if (buffer) {
-                if (buffer) {
+                buffer += ret;
-                        buffer += i_error;
+                buffer_size -= ret;
-                        buffer_size -= i_error;
-                }
-                b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
-                if (b_error < 0)
-                        i_error = 0;
        }
+        ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
+        if (ret < 0)
+                goto errout;
+        ret += ret2;
+errout:
        up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
-        return i_error + b_error;
+        return ret;
 }
 /*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 /*
 * ext4_xattr_set_handle()
 *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c2..f50408901f7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
                        struct msdos_dir_entry *de, loff_t i_pos);
 extern int fat_sync_inode(struct inode *inode);
 extern int fat_fill_super(struct super_block *sb, void *data, int silent,
-                        const struct inode_operations *fs_dir_inode_ops, int isvfat);
+                        const struct inode_operations *fs_dir_inode_ops,
+                        int isvfat, void (*setup)(struct super_block *));
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ad6998a92c3..86753fe10bd 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void fat_destroy_inode(struct inode *inode)
+static void fat_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
+static void fat_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, fat_i_callback);
+}
 static void init_once(void *foo)
 {
        struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -696,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
                struct fid *fid, int fh_len, int fh_type)
 {
        struct inode *inode = NULL;
-        struct dentry *result;
        u32 *fh = fid->raw;
        if (fh_len < 5 || fh_type != 3)
@@ -741,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
         * the fat_iget lookup again.  If that fails, then we are totally out
         * of luck.  But all that is for another day
         */
-        result = d_obtain_alias(inode);
+        return d_obtain_alias(inode);
-        if (!IS_ERR(result))
-                result->d_op = sb->s_root->d_op;
-        return result;
 }
 static int
@@ -792,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
        brelse(bh);
        parent = d_obtain_alias(inode);
-        if (!IS_ERR(parent))
-                parent->d_op = sb->s_root->d_op;
 out:
        unlock_super(sb);
@@ -1237,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
 * Read the super block of an MS-DOS FS.
 */
 int fat_fill_super(struct super_block *sb, void *data, int silent,
-                   const struct inode_operations *fs_dir_inode_ops, int isvfat)
+                   const struct inode_operations *fs_dir_inode_ops, int isvfat,
+                   void (*setup)(struct super_block *))
 {
        struct inode *root_inode = NULL, *fat_inode = NULL;
        struct buffer_head *bh;
@@ -1273,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        if (error)
                goto out_fail;
+        setup(sb); /* flavour-specific stuff that needs options */
        error = -EIO;
        sb_min_blocksize(sb, 512);
        bh = sb_bread(sb, 0);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3345aabd1dd..711499040eb 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
 * that the existing dentry can be used. The msdos fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
+               struct qstr *qstr)
 {
        struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
        unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
 * Compare two msdos names. If either of the names are invalid,
 * we fall back to doing the standard name comparison.
 */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+        struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
        unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
        int error;
-        error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+        error = msdos_format_name(name->name, name->len, a_msdos_name, options);
        if (error)
                goto old_compare;
-        error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+        error = msdos_format_name(str, len, b_msdos_name, options);
        if (error)
                goto old_compare;
        error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
 old_compare:
        error = 1;
-        if (a->len == b->len)
+        if (name->len == len)
-                error = memcmp(a->name, b->name, a->len);
+                error = memcmp(name->name, str, len);
        goto out;
 }
@@ -224,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
        }
 out:
        unlock_super(sb);
-        dentry->d_op = &msdos_dentry_operations;
+        return d_splice_alias(inode, dentry);
-        dentry = d_splice_alias(inode, dentry);
-        if (dentry)
-                dentry->d_op = &msdos_dentry_operations;
-        return dentry;
 error:
        unlock_super(sb);
@@ -658,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = {
        .getattr        = fat_getattr,
 };
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+static void setup(struct super_block *sb)
 {
-        int res;
+        sb->s_d_op = &msdos_dentry_operations;
-        lock_super(sb);
-        res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-        if (res) {
-                unlock_super(sb);
-                return res;
-        }
        sb->s_flags |= MS_NOATIME;
-        sb->s_root->d_op = &msdos_dentry_operations;
+}
-        unlock_super(sb);
-        return 0;
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+        return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
+                             0, setup);
 }
 static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b936703b892..f88f752babd 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        /* This is not negative dentry. Always valid. */
        if (dentry->d_inode)
                return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        /*
         * This is not negative dentry. Always valid.
         *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 }
 /* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
+static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
 {
-        unsigned int len = qstr->len;
+        while (len && name[len - 1] == '.')
-        while (len && qstr->name[len - 1] == '.')
                len--;
        return len;
 }
+static unsigned int vfat_striptail_len(const struct qstr *qstr)
+{
+        return __vfat_striptail_len(qstr->len, qstr->name);
+}
 /*
 * Compute the hash for the vfat name corresponding to the dentry.
 * Note: if the name is invalid, we leave the hash code unchanged so
 * that the existing dentry can be used. The vfat fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
        return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
 * that the existing dentry can be used. The vfat fs routines will
 * return ENOENT or EINVAL as appropriate.
 */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
-        struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+        struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
        const unsigned char *name;
        unsigned int len;
        unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
 /*
 * Case insensitive compare of two vfat names.
 */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+        struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
        unsigned int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = vfat_striptail_len(a);
+        alen = vfat_striptail_len(name);
-        blen = vfat_striptail_len(b);
+        blen = __vfat_striptail_len(len, str);
        if (alen == blen) {
-                if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+                if (nls_strnicmp(t, name->name, str, alen) == 0)
                        return 0;
        }
        return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
 /*
 * Case sensitive compare of two vfat names.
 */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        unsigned int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = vfat_striptail_len(a);
+        alen = vfat_striptail_len(name);
-        blen = vfat_striptail_len(b);
+        blen = __vfat_striptail_len(len, str);
        if (alen == blen) {
-                if (strncmp(a->name, b->name, alen) == 0)
+                if (strncmp(name->name, str, alen) == 0)
                        return 0;
        }
        return 1;
@@ -757,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 out:
        unlock_super(sb);
-        dentry->d_op = sb->s_root->d_op;
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
-        if (dentry) {
+        if (dentry)
-                dentry->d_op = sb->s_root->d_op;
                dentry->d_time = dentry->d_parent->d_inode->i_version;
-        }
        return dentry;
 error:
@@ -1051,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = {
        .getattr        = fat_getattr,
 };
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+static void setup(struct super_block *sb)
 {
-        int res;
-        lock_super(sb);
-        res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-        if (res) {
-                unlock_super(sb);
-                return res;
-        }
        if (MSDOS_SB(sb)->options.name_check != 's')
-                sb->s_root->d_op = &vfat_ci_dentry_ops;
+                sb->s_d_op = &vfat_ci_dentry_ops;
        else
-                sb->s_root->d_op = &vfat_dentry_ops;
+                sb->s_d_op = &vfat_dentry_ops;
+}
-        unlock_super(sb);
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
-        return 0;
+{
+        return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
+                             1, setup);
 }
 static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b..c3e89adf53c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
        struct files_struct *files = current->files;
        *fput_needed = 0;
-        if (likely((atomic_read(&files->count) == 1))) {
+        if (atomic_read(&files->count) == 1) {
                file = fcheck_files(files, fd);
        } else {
                rcu_read_lock();
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8ee..751d6b255a1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
                tmp = &(*tmp)->next;
        }
        write_unlock(&file_systems_lock);
+        synchronize_rcu();
        return -EINVAL;
 }
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 8c04eac5079..2ba6719ac61 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
        return ip;
 }
+static void vxfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(vxfs_inode_cachep, inode->i_private);
+}
 /**
 * vxfs_evict_inode - remove inode from main memory
 * @ip:         inode to discard.
@@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
 {
        truncate_inode_pages(&ip->i_data, 0);
        end_writeback(ip);
-        kmem_cache_free(vxfs_inode_cachep, ip->i_private);
+        call_rcu(&ip->i_rcu, vxfs_i_callback);
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953a..59c6e495678 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
        return list_entry(head, struct inode, i_wb_list);
 }
-static void bdi_queue_work(struct backing_dev_info *bdi,
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-                struct wb_writeback_work *work)
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-        trace_writeback_queue(bdi, work);
-        spin_lock_bh(&bdi->wb_lock);
-        list_add_tail(&work->list, &bdi->work_list);
        if (bdi->wb.task) {
                wake_up_process(bdi->wb.task);
        } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
                 * The bdi thread isn't there, wake up the forker thread which
                 * will create and run it.
                 */
-                trace_writeback_nothread(bdi, work);
                wake_up_process(default_backing_dev_info.wb.task);
        }
+}
+static void bdi_queue_work(struct backing_dev_info *bdi,
+                           struct wb_writeback_work *work)
+{
+        trace_writeback_queue(bdi, work);
+        spin_lock_bh(&bdi->wb_lock);
+        list_add_tail(&work->list, &bdi->work_list);
+        if (!bdi->wb.task)
+                trace_writeback_nothread(bdi, work);
+        bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
 }
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                bool range_cyclic, bool for_background)
+                      bool range_cyclic)
 {
        struct wb_writeback_work *work;
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
        work->sync_mode = WB_SYNC_NONE;
        work->nr_pages  = nr_pages;
        work->range_cyclic = range_cyclic;
-        work->for_background = for_background;
        bdi_queue_work(bdi, work);
 }
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-        __bdi_start_writeback(bdi, nr_pages, true, false);
+        __bdi_start_writeback(bdi, nr_pages, true);
 }
 /**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 * @bdi: the backing device to write from
 *
 * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
- *   started when this function returns, we make no guarentees on
+ *   this function returns, it is only guaranteed that for given BDI
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
 */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-        __bdi_start_writeback(bdi, LONG_MAX, true, true);
+        /*
+         * We just wake up the flusher thread. It will perform background
+         * writeback as soon as there is no other work to do.
+         */
+        trace_writeback_wake_background(bdi);
+        spin_lock_bh(&bdi->wb_lock);
+        bdi_wakeup_flusher(bdi);
+        spin_unlock_bh(&bdi->wb_lock);
 }
 /*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
        };
        unsigned long oldest_jif;
        long wrote = 0;
+        long write_chunk;
        struct inode *inode;
        if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
                wbc.range_end = LLONG_MAX;
        }
+        /*
+         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+         * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+         * here avoids calling into writeback_inodes_wb() more than once.
+         *
+         * The intended call sequence for WB_SYNC_ALL writeback is:
+         *
+         *      wb_writeback()
+         *          __writeback_inodes_sb()     <== called only once
+         *              write_cache_pages()     <== called once for each inode
+         *                   (quickly) tag currently dirty pages
+         *                   (maybe slowly) sync all tagged pages
+         */
+        if (wbc.sync_mode == WB_SYNC_NONE)
+                write_chunk = MAX_WRITEBACK_PAGES;
+        else
+                write_chunk = LONG_MAX;
        wbc.wb_start = jiffies; /* livelock avoidance */
        for (;;) {
                /*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
                /*
+                 * Background writeout and kupdate-style writeback may
+                 * run forever. Stop them if there is other work to do
+                 * so that e.g. sync can proceed. They'll be restarted
+                 * after the other works are all done.
+                 */
+                if ((work->for_background || work->for_kupdate) &&
+                    !list_empty(&wb->bdi->work_list))
+                        break;
+                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
                wbc.more_io = 0;
-                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;
                trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
                        writeback_inodes_wb(wb, &wbc);
                trace_wbc_writeback_written(&wbc, wb->bdi);
-                work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+                work->nr_pages -= write_chunk - wbc.nr_to_write;
-                wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+                wrote += write_chunk - wbc.nr_to_write;
                /*
                 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                /*
                 * Did we write something? Try for more
                 */
-                if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+                if (wbc.nr_to_write < write_chunk)
                        continue;
                /*
                 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
                get_nr_dirty_inodes();
 }
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+        if (over_bground_thresh()) {
+                struct wb_writeback_work work = {
+                        .nr_pages       = LONG_MAX,
+                        .sync_mode      = WB_SYNC_NONE,
+                        .for_background = 1,
+                        .range_cyclic   = 1,
+                };
+                return wb_writeback(wb, &work);
+        }
+        return 0;
+}
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
        unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
+        wrote += wb_check_background_flush(wb);
        clear_bit(BDI_writeback_running, &wb->bdi->state);
        return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                if (!bdi_has_dirty_io(bdi))
                        continue;
-                __bdi_start_writeback(bdi, nr_pages, false, false);
+                __bdi_start_writeback(bdi, nr_pages, false);
        }
        rcu_read_unlock();
 }
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
 */
 void sync_inodes_sb(struct super_block *sb)
 {
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 EXPORT_SYMBOL(sync_inode);
 /**
- * sync_inode - write an inode to disk
+ * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
- * Write an inode to disk and adjust it's dirty state after completion.
+ * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3..78b519c1353 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
 #include <linux/path.h>
 #include <linux/slab.h>
 #include <linux/fs_struct.h>
+#include "internal.h"
+static inline void path_get_longterm(struct path *path)
+{
+        path_get(path);
+        mnt_make_longterm(path->mnt);
+}
+static inline void path_put_longterm(struct path *path)
+{
+        mnt_make_shortterm(path->mnt);
+        path_put(path);
+}
 /*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -14,12 +27,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
        struct path old_root;
        spin_lock(&fs->lock);
+        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
-        path_get(path);
+        path_get_longterm(path);
+        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_root.dentry)
-                path_put(&old_root);
+                path_put_longterm(&old_root);
 }
 /*
@@ -31,13 +46,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
        struct path old_pwd;
        spin_lock(&fs->lock);
+        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
-        path_get(path);
+        path_get_longterm(path);
+        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_pwd.dentry)
-                path_put(&old_pwd);
+                path_put_longterm(&old_pwd);
 }
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +69,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
                fs = p->fs;
                if (fs) {
                        spin_lock(&fs->lock);
+                        write_seqcount_begin(&fs->seq);
                        if (fs->root.dentry == old_root->dentry
                            && fs->root.mnt == old_root->mnt) {
-                                path_get(new_root);
+                                path_get_longterm(new_root);
                                fs->root = *new_root;
                                count++;
                        }
                        if (fs->pwd.dentry == old_root->dentry
                            && fs->pwd.mnt == old_root->mnt) {
-                                path_get(new_root);
+                                path_get_longterm(new_root);
                                fs->pwd = *new_root;
                                count++;
                        }
+                        write_seqcount_end(&fs->seq);
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
        while (count--)
-                path_put(old_root);
+                path_put_longterm(old_root);
 }
 void free_fs_struct(struct fs_struct *fs)
 {
-        path_put(&fs->root);
+        path_put_longterm(&fs->root);
-        path_put(&fs->pwd);
+        path_put_longterm(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
 }
@@ -88,8 +107,10 @@ void exit_fs(struct task_struct *tsk)
                int kill;
                task_lock(tsk);
                spin_lock(&fs->lock);
+                write_seqcount_begin(&fs->seq);
                tsk->fs = NULL;
                kill = !--fs->users;
+                write_seqcount_end(&fs->seq);
                spin_unlock(&fs->lock);
                task_unlock(tsk);
                if (kill)
@@ -105,8 +126,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
                fs->users = 1;
                fs->in_exec = 0;
                spin_lock_init(&fs->lock);
+                seqcount_init(&fs->seq);
                fs->umask = old->umask;
-                get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
+                spin_lock(&old->lock);
+                fs->root = old->root;
+                path_get_longterm(&fs->root);
+                fs->pwd = old->pwd;
+                path_get_longterm(&fs->pwd);
+                spin_unlock(&old->lock);
        }
        return fs;
 }
@@ -144,6 +172,7 @@ EXPORT_SYMBOL(current_umask);
 struct fs_struct init_fs = {
        .users          = 1,
        .lock           = __SPIN_LOCK_UNLOCKED(init_fs.lock),
+        .seq            = SEQCNT_ZERO,
        .umask          = 0022,
 };
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede0..48a18f184d5 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
                object->n_ops++;
                object->n_exclusive++;  /* reads and writes must wait */
-                if (object->n_ops > 0) {
+                if (object->n_ops > 1) {
                        atomic_inc(&op->usage);
                        list_add_tail(&op->pend_link, &object->pending_ops);
                        fscache_stat(&fscache_n_op_pend);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308d..cf8d28d1fba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+                       u64 nodeid, u64 nlookup)
+{
+        forget->forget_one.nodeid = nodeid;
+        forget->forget_one.nlookup = nlookup;
+        spin_lock(&fc->lock);
+        fc->forget_list_tail->next = forget;
+        fc->forget_list_tail = forget;
+        wake_up(&fc->waitq);
+        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+        spin_unlock(&fc->lock);
+}
 static void flush_bg_queue(struct fuse_conn *fc)
 {
        while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
        }
 }
-void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
-{
-        req->isreply = 0;
-        fuse_request_send_nowait(fc, req);
-}
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
        return err;
 }
+static int forget_pending(struct fuse_conn *fc)
+{
+        return fc->forget_list_head.next != NULL;
+}
 static int request_pending(struct fuse_conn *fc)
 {
-        return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+        return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
+                forget_pending(fc);
 }
 /* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
        return err ? err : reqsize;
 }
+static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+                                               unsigned max,
+                                               unsigned *countp)
+{
+        struct fuse_forget_link *head = fc->forget_list_head.next;
+        struct fuse_forget_link **newhead = &head;
+        unsigned count;
+        for (count = 0; *newhead != NULL && count < max; count++)
+                newhead = &(*newhead)->next;
+        fc->forget_list_head.next = *newhead;
+        *newhead = NULL;
+        if (fc->forget_list_head.next == NULL)
+                fc->forget_list_tail = &fc->forget_list_head;
+        if (countp != NULL)
+                *countp = count;
+        return head;
+}
+static int fuse_read_single_forget(struct fuse_conn *fc,
+                                   struct fuse_copy_state *cs,
+                                   size_t nbytes)
+__releases(fc->lock)
+{
+        int err;
+        struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+        struct fuse_forget_in arg = {
+                .nlookup = forget->forget_one.nlookup,
+        };
+        struct fuse_in_header ih = {
+                .opcode = FUSE_FORGET,
+                .nodeid = forget->forget_one.nodeid,
+                .unique = fuse_get_unique(fc),
+                .len = sizeof(ih) + sizeof(arg),
+        };
+        spin_unlock(&fc->lock);
+        kfree(forget);
+        if (nbytes < ih.len)
+                return -EINVAL;
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
+        if (!err)
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        fuse_copy_finish(cs);
+        if (err)
+                return err;
+        return ih.len;
+}
+static int fuse_read_batch_forget(struct fuse_conn *fc,
+                                   struct fuse_copy_state *cs, size_t nbytes)
+__releases(fc->lock)
+{
+        int err;
+        unsigned max_forgets;
+        unsigned count;
+        struct fuse_forget_link *head;
+        struct fuse_batch_forget_in arg = { .count = 0 };
+        struct fuse_in_header ih = {
+                .opcode = FUSE_BATCH_FORGET,
+                .unique = fuse_get_unique(fc),
+                .len = sizeof(ih) + sizeof(arg),
+        };
+        if (nbytes < ih.len) {
+                spin_unlock(&fc->lock);
+                return -EINVAL;
+        }
+        max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
+        head = dequeue_forget(fc, max_forgets, &count);
+        spin_unlock(&fc->lock);
+        arg.count = count;
+        ih.len += count * sizeof(struct fuse_forget_one);
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
+        if (!err)
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        while (head) {
+                struct fuse_forget_link *forget = head;
+                if (!err) {
+                        err = fuse_copy_one(cs, &forget->forget_one,
+                                            sizeof(forget->forget_one));
+                }
+                head = forget->next;
+                kfree(forget);
+        }
+        fuse_copy_finish(cs);
+        if (err)
+                return err;
+        return ih.len;
+}
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+                            size_t nbytes)
+__releases(fc->lock)
+{
+        if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
+                return fuse_read_single_forget(fc, cs, nbytes);
+        else
+                return fuse_read_batch_forget(fc, cs, nbytes);
+}
 /*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
                return fuse_read_interrupt(fc, cs, nbytes, req);
        }
+        if (forget_pending(fc)) {
+                if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
+                        return fuse_read_forget(fc, cs, nbytes);
+                if (fc->forget_batch <= -8)
+                        fc->forget_batch = 16;
+        }
        req = list_entry(fc->pending.next, struct fuse_req, list);
        req->state = FUSE_REQ_READING;
        list_move(&req->list, &fc->io);
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
        if (!fc)
                return -EPERM;
-        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
        if (!fc)
                return -EPERM;
-        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
        flush_bg_queue(fc);
        end_requests(fc, &fc->pending);
        end_requests(fc, &fc->processing);
+        while (forget_pending(fc))
+                kfree(dequeue_forget(fc, 1, NULL));
 }
 /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482..bfed8447ed8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
-#include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #if BITS_PER_LONG >= 64
 static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
 */
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
-        struct inode *inode = entry->d_inode;
+        struct inode *inode;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = entry->d_inode;
        if (inode && is_bad_inode(inode))
                return 0;
        else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                struct fuse_entry_out outarg;
                struct fuse_conn *fc;
                struct fuse_req *req;
-                struct fuse_req *forget_req;
+                struct fuse_forget_link *forget;
                struct dentry *parent;
                u64 attr_version;
@@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                if (IS_ERR(req))
                        return 0;
-                forget_req = fuse_get_req(fc);
+                forget = fuse_alloc_forget();
-                if (IS_ERR(forget_req)) {
+                if (!forget) {
                        fuse_put_request(fc, req);
                        return 0;
                }
@@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                if (!err) {
                        struct fuse_inode *fi = get_fuse_inode(inode);
                        if (outarg.nodeid != get_node_id(inode)) {
-                                fuse_send_forget(fc, forget_req,
+                                fuse_queue_forget(fc, forget, outarg.nodeid, 1);
-                                                 outarg.nodeid, 1);
                                return 0;
                        }
                        spin_lock(&fc->lock);
                        fi->nlookup++;
                        spin_unlock(&fc->lock);
                }
-                fuse_put_request(fc, forget_req);
+                kfree(forget);
                if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
                        return 0;
@@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 {
        struct fuse_conn *fc = get_fuse_conn_super(sb);
        struct fuse_req *req;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        u64 attr_version;
        int err;
@@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
        if (IS_ERR(req))
                goto out;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        err = PTR_ERR(forget_req);
+        err = -ENOMEM;
-        if (IS_ERR(forget_req)) {
+        if (!forget) {
                fuse_put_request(fc, req);
                goto out;
        }
@@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
                           attr_version);
        err = -ENOMEM;
        if (!*inode) {
-                fuse_send_forget(fc, forget_req, outarg->nodeid, 1);
+                fuse_queue_forget(fc, forget, outarg->nodeid, 1);
                goto out;
        }
        err = 0;
 out_put_forget:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
 out:
        return err;
 }
@@ -347,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
        }
        entry = newent ? newent : entry;
-        entry->d_op = &fuse_dentry_operations;
        if (outarg_valid)
                fuse_change_entry_timeout(entry, &outarg);
        else
@@ -374,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        struct inode *inode;
        struct fuse_conn *fc = get_fuse_conn(dir);
        struct fuse_req *req;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        struct fuse_create_in inarg;
        struct fuse_open_out outopen;
        struct fuse_entry_out outentry;
@@ -388,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (flags & O_DIRECT)
                return -EINVAL;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        if (IS_ERR(forget_req))
+        if (!forget)
-                return PTR_ERR(forget_req);
+                return -ENOMEM;
        req = fuse_get_req(fc);
        err = PTR_ERR(req);
@@ -448,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (!inode) {
                flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
                fuse_sync_release(ff, flags);
-                fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
+                fuse_queue_forget(fc, forget, outentry.nodeid, 1);
                return -ENOMEM;
        }
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        d_instantiate(entry, inode);
        fuse_change_entry_timeout(entry, &outentry);
        fuse_invalidate_attr(dir);
@@ -469,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 out_put_request:
        fuse_put_request(fc, req);
 out_put_forget_req:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        return err;
 }
@@ -483,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        struct fuse_entry_out outarg;
        struct inode *inode;
        int err;
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
-        forget_req = fuse_get_req(fc);
+        forget = fuse_alloc_forget();
-        if (IS_ERR(forget_req)) {
+        if (!forget) {
                fuse_put_request(fc, req);
-                return PTR_ERR(forget_req);
+                return -ENOMEM;
        }
        memset(&outarg, 0, sizeof(outarg));
@@ -515,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
                          &outarg.attr, entry_attr_timeout(&outarg), 0);
        if (!inode) {
-                fuse_send_forget(fc, forget_req, outarg.nodeid, 1);
+                fuse_queue_forget(fc, forget, outarg.nodeid, 1);
                return -ENOMEM;
        }
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *alias;
@@ -541,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        return 0;
 out_put_forget_req:
-        fuse_put_request(fc, forget_req);
+        kfree(forget);
        return err;
 }
@@ -981,12 +983,15 @@ static int fuse_access(struct inode *inode, int mask)
 * access request is sent.  Execute permission is still checked
 * locally based on file mode.
 */
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        bool refreshed = false;
        int err = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        if (!fuse_allow_task(fc, current))
                return -EACCES;
@@ -1001,7 +1006,7 @@ static int fuse_permission(struct inode *inode, int mask)
        }
        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-                err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, flags, NULL);
                /* If permission is denied, try to refresh file
                   attributes.  This is also needed, because the root
@@ -1009,7 +1014,8 @@ static int fuse_permission(struct inode *inode, int mask)
                if (err == -EACCES && !refreshed) {
                        err = fuse_do_getattr(inode, NULL, NULL);
                        if (!err)
-                                err = generic_permission(inode, mask, NULL);
+                                err = generic_permission(inode, mask,
+                                                        flags, NULL);
                }
                /* Note: the opposite of the above test does not
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123..95da1bc1c82 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 static const struct file_operations fuse_direct_io_file_operations;
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
 void fuse_finish_open(struct inode *inode, struct file *file)
 {
        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = get_fuse_conn(inode);
        if (ff->open_flags & FOPEN_DIRECT_IO)
                file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
                invalidate_inode_pages2(inode->i_mapping);
        if (ff->open_flags & FOPEN_NONSEEKABLE)
                nonseekable_open(inode, file);
+        if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+                struct fuse_inode *fi = get_fuse_inode(inode);
+                spin_lock(&fc->lock);
+                fi->attr_version = ++fc->attr_version;
+                i_size_write(inode, 0);
+                spin_unlock(&fc->lock);
+                fuse_invalidate_attr(inode);
+        }
 }
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1618,6 +1629,94 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 }
 /*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit.  Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+                                     size_t transferred, unsigned count,
+                                     bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+        if (count * sizeof(struct compat_iovec) == transferred) {
+                struct compat_iovec *ciov = src;
+                unsigned i;
+                /*
+                 * With this interface a 32bit server cannot support
+                 * non-compat (i.e. ones coming from 64bit apps) ioctl
+                 * requests
+                 */
+                if (!is_compat)
+                        return -EINVAL;
+                for (i = 0; i < count; i++) {
+                        dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+                        dst[i].iov_len = ciov[i].iov_len;
+                }
+                return 0;
+        }
+#endif
+        if (count * sizeof(struct iovec) != transferred)
+                return -EIO;
+        memcpy(dst, src, transferred);
+        return 0;
+}
+/*
+ * Make sure iov_length() won't overflow: walk all @count elements of @iov
+ * and fail as soon as the running total of iov_len would exceed
+ * FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT.  Each element has to be examined
+ * in turn; checking only the first one would let a small leading entry
+ * hide oversized later entries.
+ *
+ * Returns 0 if the total length fits, -ENOMEM otherwise.
+ */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+        size_t n;
+        u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+        for (n = 0; n < count; n++) {
+                if (iov[n].iov_len > (size_t) max)
+                        return -ENOMEM;
+                /* remaining budget for the elements still to be checked */
+                max -= iov[n].iov_len;
+        }
+        return 0;
+}
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+                                 void *src, size_t transferred, unsigned count,
+                                 bool is_compat)
+{
+        unsigned i;
+        struct fuse_ioctl_iovec *fiov = src;
+        if (fc->minor < 16) {
+                return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+                                                 count, is_compat);
+        }
+        if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+                return -EIO;
+        for (i = 0; i < count; i++) {
+                /* Did the server supply an inappropriate value? */
+                if (fiov[i].base != (unsigned long) fiov[i].base ||
+                    fiov[i].len != (unsigned long) fiov[i].len)
+                        return -EIO;
+                dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+                dst[i].iov_len = (size_t) fiov[i].len;
+#ifdef CONFIG_COMPAT
+                if (is_compat &&
+                    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+                     (compat_size_t) dst[i].iov_len != fiov[i].len))
+                        return -EIO;
+#endif
+        }
+        return 0;
+}
+/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
@@ -1677,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
        struct fuse_ioctl_out outarg;
        struct fuse_req *req = NULL;
        struct page **pages = NULL;
-        struct page *iov_page = NULL;
+        struct iovec *iov_page = NULL;
        struct iovec *in_iov = NULL, *out_iov = NULL;
        unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
        size_t in_size, out_size, transferred;
        int err;
+#if BITS_PER_LONG == 32
+        inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+        if (flags & FUSE_IOCTL_COMPAT)
+                inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
        /* assume all the iovs returned by client always fits in a page */
-        BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+        BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
        err = -ENOMEM;
        pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
-        iov_page = alloc_page(GFP_KERNEL);
+        iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
        if (!pages || !iov_page)
                goto out;
@@ -1697,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
         * RETRY from server is not allowed.
         */
        if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
-                struct iovec *iov = page_address(iov_page);
+                struct iovec *iov = iov_page;
                iov->iov_base = (void __user *)arg;
                iov->iov_len = _IOC_SIZE(cmd);
@@ -1778,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
        /* did it ask for retry? */
        if (outarg.flags & FUSE_IOCTL_RETRY) {
-                char *vaddr;
+                void *vaddr;
                /* no retry if in restricted mode */
                err = -EIO;
@@ -1798,18 +1904,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
                        goto out;
-                err = -EIO;
-                if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
-                        goto out;
-                /* okay, copy in iovs and retry */
                vaddr = kmap_atomic(pages[0], KM_USER0);
-                memcpy(page_address(iov_page), vaddr, transferred);
+                err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+                                            transferred, in_iovs + out_iovs,
+                                            (flags & FUSE_IOCTL_COMPAT) != 0);
                kunmap_atomic(vaddr, KM_USER0);
+                if (err)
+                        goto out;
-                in_iov = page_address(iov_page);
+                in_iov = iov_page;
                out_iov = in_iov + in_iovs;
+                err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+                if (err)
+                        goto out;
+                err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+                if (err)
+                        goto out;
                goto retry;
        }
@@ -1821,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 out:
        if (req)
                fuse_put_request(fc, req);
-        if (iov_page)
+        free_page((unsigned long) iov_page);
-                __free_page(iov_page);
        while (num_pages)
                __free_page(pages[--num_pages]);
        kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f10..ae5744a2f9e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
 extern unsigned max_user_bgreq;
 extern unsigned max_user_congthresh;
+/*
+ * One forget request.  Links are chained through @next into a singly
+ * linked list.
+ */
+struct fuse_forget_link {
+        /* The forget record itself (struct fuse_forget_one is declared elsewhere) */
+        struct fuse_forget_one forget_one;
+        /* Next link in the list */
+        struct fuse_forget_link *next;
+};
 /** FUSE inode */
 struct fuse_inode {
        /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
        u64 nlookup;
        /** The request used for sending the FORGET message */
-        struct fuse_req *forget_req;
+        struct fuse_forget_link *forget;
        /** Time in jiffies until the file attributes are valid */
        u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
        /** Data for asynchronous requests */
        union {
-                struct fuse_forget_in forget_in;
                struct {
                        struct fuse_release_in in;
                        struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
        /** Pending interrupts */
        struct list_head interrupts;
+        /** Queue of pending forgets */
+        struct fuse_forget_link forget_list_head;
+        struct fuse_forget_link *forget_list_tail;
+        /** Batching of FORGET requests (positive indicates FORGET batch) */
+        int forget_batch;
        /** Flag indicating if connection is blocked.  This will be
            the case before the INIT reply is received, and if there
            are too many outstading backgrounds requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 /**
 * Send FORGET command
 */
-void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
-                      u64 nodeid, u64 nlookup);
+                       u64 nodeid, u64 nlookup);
+struct fuse_forget_link *fuse_alloc_forget(void);
 /**
 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 /**
- * Send a request with no reply
- */
-void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
-/**
 * Send a request in the background
 */
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cfce3ad86a9..9e3f68cc1bd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
        unsigned blksize;
 };
+/*
+ * Allocate a zero-initialised struct fuse_forget_link with GFP_KERNEL.
+ *
+ * Returns the new link, or NULL if the allocation fails.
+ */
+struct fuse_forget_link *fuse_alloc_forget(void)
+{
+        return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+}
 static struct inode *fuse_alloc_inode(struct super_block *sb)
 {
        struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&fi->queued_writes);
        INIT_LIST_HEAD(&fi->writepages);
        init_waitqueue_head(&fi->page_waitq);
-        fi->forget_req = fuse_request_alloc();
+        fi->forget = fuse_alloc_forget();
-        if (!fi->forget_req) {
+        if (!fi->forget) {
                kmem_cache_free(fuse_inode_cachep, inode);
                return NULL;
        }
@@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
        return inode;
 }
-static void fuse_destroy_inode(struct inode *inode)
+static void fuse_i_callback(struct rcu_head *head)
 {
-        struct fuse_inode *fi = get_fuse_inode(inode);
+        struct inode *inode = container_of(head, struct inode, i_rcu);
-        BUG_ON(!list_empty(&fi->write_files));
+        INIT_LIST_HEAD(&inode->i_dentry);
-        BUG_ON(!list_empty(&fi->queued_writes));
-        if (fi->forget_req)
-                fuse_request_free(fi->forget_req);
        kmem_cache_free(fuse_inode_cachep, inode);
 }
-void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+static void fuse_destroy_inode(struct inode *inode)
-                      u64 nodeid, u64 nlookup)
 {
-        struct fuse_forget_in *inarg = &req->misc.forget_in;
+        struct fuse_inode *fi = get_fuse_inode(inode);
-        inarg->nlookup = nlookup;
+        BUG_ON(!list_empty(&fi->write_files));
-        req->in.h.opcode = FUSE_FORGET;
+        BUG_ON(!list_empty(&fi->queued_writes));
-        req->in.h.nodeid = nodeid;
+        kfree(fi->forget);
-        req->in.numargs = 1;
+        call_rcu(&inode->i_rcu, fuse_i_callback);
-        req->in.args[0].size = sizeof(struct fuse_forget_in);
-        req->in.args[0].value = inarg;
-        fuse_request_send_noreply(fc, req);
 }
 static void fuse_evict_inode(struct inode *inode)
@@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
        if (inode->i_sb->s_flags & MS_ACTIVE) {
                struct fuse_conn *fc = get_fuse_conn(inode);
                struct fuse_inode *fi = get_fuse_inode(inode);
-                fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup);
+                fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
-                fi->forget_req = NULL;
+                fi->forget = NULL;
        }
 }
@@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
        INIT_LIST_HEAD(&fc->interrupts);
        INIT_LIST_HEAD(&fc->bg_queue);
        INIT_LIST_HEAD(&fc->entry);
+        fc->forget_list_tail = &fc->forget_list_head;
        atomic_set(&fc->num_waiting, 0);
        fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
        fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -618,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
                goto out_iput;
        entry = d_obtain_alias(inode);
-        if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
+        if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
-                entry->d_op = &fuse_dentry_operations;
                fuse_invalidate_entry_cache(entry);
-        }
        return entry;
@@ -720,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
        }
        parent = d_obtain_alias(inode);
-        if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
+        if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
-                parent->d_op = &fuse_dentry_operations;
                fuse_invalidate_entry_cache(parent);
-        }
        return parent;
 }
@@ -990,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
                iput(root);
                goto err_put_conn;
        }
+        /* only now - we want root dentry with NULL ->d_op */
+        sb->s_d_op = &fuse_dentry_operations;
        init_req = fuse_request_alloc();
        if (!init_req)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a69..06c48a89183 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
 }
 int
-generic_check_acl(struct inode *inode, int mask)
+generic_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
-        if (acl) {
+                        return -ECHILD;
-                int error = posix_acl_permission(inode, acl, mask);
+        } else {
-                posix_acl_release(acl);
+                struct posix_acl *acl;
-                return error;
+                acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+                if (acl) {
+                        int error = posix_acl_permission(inode, acl, mask);
+                        posix_acl_release(acl);
+                        return error;
+                }
        }
        return -EAGAIN;
 }
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943..7118f1a780a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
 * Returns: errno
 */
-int gfs2_check_acl(struct inode *inode, int mask)
+int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39e..a93907c8159 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
 #define GFS2_POSIX_ACL_DEFAULT          "posix_acl_default"
 #define GFS2_ACL_MAX_ENTRIES            25
-extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4e..3c4039d5eef 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        int metadata;
        unsigned int revokes = 0;
        int x;
-        int error;
+        int error = 0;
        if (!*top)
                sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-        error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+        if (ip != GFS2_I(sdp->sd_rindex))
+                error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+        else if (!sdp->sd_rgrps)
+                error = gfs2_ri_update(ip);
        if (error)
                return error;
@@ -879,7 +883,8 @@ out_rg_gunlock:
 out_rlist:
        gfs2_rlist_free(&rlist);
 out:
-        gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+        if (ip != GFS2_I(sdp->sd_rindex))
+                gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
        return error;
 }
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 6798755b385..4a456338b87 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/namei.h>
 #include <linux/crc32.h>
 #include "gfs2.h"
@@ -34,15 +35,23 @@
 static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct dentry *parent = dget_parent(dentry);
+        struct dentry *parent;
-        struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+        struct gfs2_sbd *sdp;
-        struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+        struct gfs2_inode *dip;
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        struct gfs2_holder d_gh;
        struct gfs2_inode *ip = NULL;
        int error;
        int had_lock = 0;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        parent = dget_parent(dentry);
+        sdp = GFS2_SB(parent->d_inode);
+        dip = GFS2_I(parent->d_inode);
+        inode = dentry->d_inode;
        if (inode) {
                if (is_bad_inode(inode))
                        goto invalid;
@@ -100,13 +109,14 @@ fail:
        return 0;
 }
-static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *str)
 {
        str->hash = gfs2_disk_hash(str->name, str->len);
        return 0;
 }
-static int gfs2_dentry_delete(struct dentry *dentry)
+static int gfs2_dentry_delete(const struct dentry *dentry)
 {
        struct gfs2_inode *ginode;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d3..9023db8184f 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,22 +126,14 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
-        struct dentry *dentry;
+        return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
-        dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
-        if (!IS_ERR(dentry))
-                dentry->d_op = &gfs2_dops;
-        return dentry;
 }
 static struct dentry *gfs2_get_dentry(struct super_block *sb,
                                      struct gfs2_inum_host *inum)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
-        struct gfs2_holder i_gh;
        struct inode *inode;
-        struct dentry *dentry;
-        int error;
        inode = gfs2_ilookup(sb, inum->no_addr);
        if (inode) {
@@ -152,52 +144,13 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
                goto out_inode;
        }
-        error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+        inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
-                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+                                    GFS2_BLKST_DINODE);
-        if (error)
+        if (IS_ERR(inode))
-                return ERR_PTR(error);
+                return ERR_CAST(inode);
-        error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
-        if (error)
-                goto fail;
-        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
-        if (IS_ERR(inode)) {
-                error = PTR_ERR(inode);
-                goto fail;
-        }
-        error = gfs2_inode_refresh(GFS2_I(inode));
-        if (error) {
-                iput(inode);
-                goto fail;
-        }
-        /* Pick up the works we bypass in gfs2_inode_lookup */
-        if (inode->i_state & I_NEW) 
-                gfs2_set_iop(inode);
-        if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-                iput(inode);
-                goto fail;
-        }
-        error = -EIO;
-        if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
-                iput(inode);
-                goto fail;
-        }
-        gfs2_glock_dq_uninit(&i_gh);
 out_inode:
-        dentry = d_obtain_alias(inode);
+        return d_obtain_alias(inode);
-        if (!IS_ERR(dentry))
-                dentry->d_op = &gfs2_dops;
-        return dentry;
-fail:
-        gfs2_glock_dq_uninit(&i_gh);
-        return ERR_PTR(error);
 }
 static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5..7cfdcb91336 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <asm/uaccess.h>
@@ -241,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
            !capable(CAP_LINUX_IMMUTABLE))
                goto out;
        if (!IS_IMMUTABLE(inode)) {
-                error = gfs2_permission(inode, MAY_WRITE);
+                error = gfs2_permission(inode, MAY_WRITE, 0);
                if (error)
                        goto out;
        }
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
+/*
+ * Complete the write of an empty range [from, to) of @page: zero the new
+ * buffers, flush and mark the page accessed, pass the range to
+ * gfs2_page_add_databufs() unless the inode is in writeback mode, and
+ * finally commit the range with block_commit_write().
+ */
+static void empty_write_end(struct page *page, unsigned from,
+                           unsigned to)
+{
+        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        page_zero_new_buffers(page, from, to);
+        flush_dcache_page(page);
+        mark_page_accessed(page);
+        /* Skipped when gfs2_is_writeback() reports writeback mode */
+        if (!gfs2_is_writeback(ip))
+                gfs2_page_add_databufs(ip, page, from, to);
+        block_commit_write(page, from, to);
+}
+/*
+ * Run __block_write_begin() + empty_write_end() over the unmapped
+ * buffers of @page that lie inside [from, to).
+ *
+ * A page without buffers is processed as a single range.  Otherwise the
+ * page's buffer list is walked via b_this_page and each run of
+ * consecutive unmapped buffers is processed as one range; buffers that
+ * are already mapped are left alone.
+ *
+ * Returns 0 on success or the error from __block_write_begin().
+ */
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+        unsigned start, end, next;
+        struct buffer_head *bh, *head;
+        int error;
+        if (!page_has_buffers(page)) {
+                error = __block_write_begin(page, from, to - from, gfs2_block_map);
+                if (unlikely(error))
+                        return error;
+                empty_write_end(page, from, to);
+                return 0;
+        }
+        bh = head = page_buffers(page);
+        next = end = 0;
+        /*
+         * Advance to the first buffer whose start offset is >= from.
+         * NOTE(review): this assumes @from is aligned to the buffer size;
+         * confirm against the caller.
+         */
+        while (next < from) {
+                next += bh->b_size;
+                bh = bh->b_this_page;
+        }
+        start = next;
+        /*
+         * start..end tracks the current run of unmapped buffers; end == 0
+         * means no run is pending.
+         */
+        do {
+                next += bh->b_size;
+                if (buffer_mapped(bh)) {
+                        /* A mapped buffer terminates the pending run */
+                        if (end) {
+                                error = __block_write_begin(page, start, end - start,
+                                                            gfs2_block_map);
+                                if (unlikely(error))
+                                        return error;
+                                empty_write_end(page, start, end);
+                                end = 0;
+                        }
+                        /* The next run can start no earlier than after this buffer */
+                        start = next;
+                }
+                else
+                        end = next;
+                bh = bh->b_this_page;
+        } while (next < to);
+        /* Flush a run that extends to the end of the range */
+        if (end) {
+                error = __block_write_begin(page, start, end - start, gfs2_block_map);
+                if (unlikely(error))
+                        return error;
+                empty_write_end(page, start, end);
+        }
+        return 0;
+}
+/*
+ * fallocate_chunk - write empty blocks for one chunk of a fallocate request
+ * @inode: inode being allocated into
+ * @offset: byte offset at which the chunk starts
+ * @len: length of the chunk in bytes
+ * @mode: fallocate mode; FALLOC_FL_KEEP_SIZE suppresses i_size updates
+ *
+ * Calls write_empty_blocks() on every page cache page overlapping
+ * [offset, offset + len) and, unless FALLOC_FL_KEEP_SIZE is set, grows
+ * i_size as each page completes.  On success the dinode is written back
+ * into its buffer and the inode is marked dirty.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+                           int mode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct buffer_head *dibh;
+        int error;
+        u64 start = offset >> PAGE_CACHE_SHIFT;
+        unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+        u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+        pgoff_t curr;
+        struct page *page;
+        unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+        unsigned int from, to;
+        /* A chunk ending exactly on a page boundary covers the whole last page */
+        if (!end_offset)
+                end_offset = PAGE_CACHE_SIZE;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (unlikely(error))
+                return error;
+        /* dibh is held from here on: every exit must pass through "out" */
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        if (gfs2_is_stuffed(ip)) {
+                error = gfs2_unstuff_dinode(ip, NULL);
+                if (unlikely(error))
+                        goto out;
+        }
+        curr = start;
+        offset = start << PAGE_CACHE_SHIFT;
+        from = start_offset;
+        to = PAGE_CACHE_SIZE;
+        while (curr <= end) {
+                page = grab_cache_page_write_begin(inode->i_mapping, curr,
+                                                   AOP_FLAG_NOFS);
+                if (unlikely(!page)) {
+                        error = -ENOMEM;
+                        goto out;
+                }
+                /* Only the last page may end short of PAGE_CACHE_SIZE */
+                if (curr == end)
+                        to = end_offset;
+                error = write_empty_blocks(page, from, to);
+                if (!error && offset + to > inode->i_size &&
+                    !(mode & FALLOC_FL_KEEP_SIZE)) {
+                        i_size_write(inode, offset + to);
+                }
+                unlock_page(page);
+                page_cache_release(page);
+                if (error)
+                        goto out;
+                curr++;
+                offset += PAGE_CACHE_SIZE;
+                /* Every page after the first is filled from its start */
+                from = 0;
+        }
+        gfs2_dinode_out(ip, dibh->b_data);
+        mark_inode_dirty(inode);
+out:
+        /* Drop the dinode buffer on success and on every error path */
+        brelse(dibh);
+        return error;
+}
+/*
+ * Work out how much can be allocated from the resource group currently
+ * reserved for @ip (ip->i_alloc->al_rgd) and, if that is more than
+ * *@data_blocks, raise *@data_blocks, *@ind_blocks and *@len to match.
+ * *@len is capped at @max, in which case the block counts are recomputed
+ * by gfs2_write_calc_reserv().
+ */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+                            unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned int avail = ip->i_alloc->al_rgd->rd_free_clone;
+        unsigned int usable = avail - 3 * (sdp->sd_max_height - 1);
+        unsigned int level = usable;
+        /* Subtract one DIV_ROUND_UP(level, sd_inptrs) share per pass */
+        while (level > sdp->sd_diptrs) {
+                level = DIV_ROUND_UP(level, sdp->sd_inptrs);
+                usable -= level;
+        }
+        /* This is not the exact inverse of gfs2_write_calc_reserv(), so
+           the result may be smaller than what the caller already has */
+        if (usable > *data_blocks) {
+                *data_blocks = usable;
+                *ind_blocks = avail - usable;
+                *len = ((loff_t)usable - 3) << sdp->sd_sb.sb_bsize_shift;
+                if (*len > max) {
+                        *len = max;
+                        gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+                }
+        }
+}
+/*
+ * gfs2_fallocate - the .fallocate file operation for GFS2
+ * @file: file to allocate into
+ * @mode: fallocate mode; only 0 or FALLOC_FL_KEEP_SIZE is accepted
+ * @offset: start of the range; rounded down to a block boundary
+ * @len: length of the range; extended so that the end is block aligned
+ *
+ * Takes the inode glock exclusively and then processes the range in
+ * chunks, each with its own allocation, quota check, in-place
+ * reservation and transaction around fallocate_chunk().
+ *
+ * Returns 0 on success, -EOPNOTSUPP for unsupported mode bits, -ENOMEM
+ * if gfs2_alloc_get() fails, or the error of the failing helper.
+ */
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
+                           loff_t len)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_inode *ip = GFS2_I(inode);
+        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+        loff_t bytes, max_bytes;
+        struct gfs2_alloc *al;
+        int error;
+        /* next = end of the range rounded up to the next block boundary */
+        loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+        next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+        /* We only support the FALLOC_FL_KEEP_SIZE mode */
+        if (mode & ~FALLOC_FL_KEEP_SIZE)
+                return -EOPNOTSUPP;
+        /* Round offset down to a block boundary and recompute len to match */
+        offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+                 sdp->sd_sb.sb_bsize_shift;
+        len = next - offset;
+        /* Initial chunk size; falls back to UINT_MAX if the product is 0 */
+        bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+        if (!bytes)
+                bytes = UINT_MAX;
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+        error = gfs2_glock_nq(&ip->i_gh);
+        if (unlikely(error))
+                goto out_uninit;
+        /* Nothing to do if no allocation is required; error is 0 here */
+        if (!gfs2_write_alloc_required(ip, offset, len))
+                goto out_unlock;
+        while (len > 0) {
+                if (len < bytes)
+                        bytes = len;
+                al = gfs2_alloc_get(ip);
+                if (!al) {
+                        error = -ENOMEM;
+                        goto out_unlock;
+                }
+                error = gfs2_quota_lock_check(ip);
+                if (error)
+                        goto out_alloc_put;
+retry:
+                gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+                al->al_requested = data_blocks + ind_blocks;
+                error = gfs2_inplace_reserve(ip);
+                if (error) {
+                        /* On -ENOSPC halve the chunk while it exceeds one block */
+                        if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+                                bytes >>= 1;
+                                goto retry;
+                        }
+                        goto out_qunlock;
+                }
+                /* calc_max_reserv() may raise max_bytes, up to at most len */
+                max_bytes = bytes;
+                calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+                al->al_requested = data_blocks + ind_blocks;
+                rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+                          RES_RG_HDR + gfs2_rg_blocks(al);
+                /* For jdata inodes the data blocks are added to rblocks too */
+                if (gfs2_is_jdata(ip))
+                        rblocks += data_blocks ? data_blocks : 1;
+                error = gfs2_trans_begin(sdp, rblocks,
+                                         PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+                if (error)
+                        goto out_trans_fail;
+                error = fallocate_chunk(inode, offset, max_bytes, mode);
+                gfs2_trans_end(sdp);
+                if (error)
+                        goto out_trans_fail;
+                len -= max_bytes;
+                offset += max_bytes;
+                /* Release this chunk's resources before the next iteration */
+                gfs2_inplace_release(ip);
+                gfs2_quota_unlock(ip);
+                gfs2_alloc_put(ip);
+        }
+        goto out_unlock;
+        /* Error unwinding: each label undoes one more step of the loop body */
+out_trans_fail:
+        gfs2_inplace_release(ip);
+out_qunlock:
+        gfs2_quota_unlock(ip);
+out_alloc_put:
+        gfs2_alloc_put(ip);
+out_unlock:
+        gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+        gfs2_holder_uninit(&ip->i_gh);
+        return error;
+}
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 /**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
        .setlease       = gfs2_setlease,
+        .fallocate      = gfs2_fallocate,
 };
 const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
        .setlease       = generic_setlease,
+        .fallocate      = gfs2_fallocate,
 };
 const struct file_operations gfs2_dir_fops_nolock = {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f09..08a8beb152e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
        spin_unlock(&gl->gl_spin);
 }
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                 unsigned int req_state,
-                                 unsigned int flags)
-{
-        int ret = LM_OUT_ERROR;
-        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-                return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-                                                         req_state, flags);
-        return ret;
-}
 /**
 * do_xmote - Calls the DLM to change the state of a lock
 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
-        BUG_ON(gl->gl_state == target);
+        GLOCK_BUG_ON(gl, gl->gl_state == target);
-        BUG_ON(gl->gl_state == gl->gl_target);
+        GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
+        gl->gl_req = target;
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
-        ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
-        if (!(ret & LM_OUT_ASYNC)) {
+        if (sdp->sd_lockstruct.ls_ops->lm_lock) {
-                finish_xmote(gl, ret);
+                /* lock_dlm */
+                ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+                GLOCK_BUG_ON(gl, ret);
+        } else { /* lock_nolock */
+                finish_xmote(gl, target);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
-        } else {
-                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }
        spin_lock(&gl->gl_spin);
 }
@@ -686,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
 {
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_inode *ip = NULL;
+        struct gfs2_inode *ip;
        struct inode *inode;
-        u64 no_addr = 0;
+        u64 no_addr = gl->gl_name.ln_number;
+        ip = gl->gl_object;
+        /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
-        spin_lock(&gl->gl_spin);
-        ip = (struct gfs2_inode *)gl->gl_object;
        if (ip)
-                no_addr = ip->i_no_addr;
-        spin_unlock(&gl->gl_spin);
-        if (ip) {
                inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
-                if (inode) {
+        else
-                        d_prune_aliases(inode);
+                inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
-                        iput(inode);
+        if (inode && !IS_ERR(inode)) {
-                }
+                d_prune_aliases(inode);
+                iput(inode);
        }
        gfs2_glock_put(gl);
 }
@@ -952,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
        if (seq) {
                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
                seq_printf(seq, gi->string);
        } else {
-                printk(KERN_ERR " ");
+                vaf.fmt = fmt;
-                vprintk(fmt, args);
+                vaf.va = &args;
+                printk(KERN_ERR " %pV", &vaf);
        }
        va_end(args);
 }
@@ -1362,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
 * @gl: Pointer to the glock
 * @ret: The return value from the dlm
 *
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
 */
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+        spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
        if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-                spin_lock(&gl->gl_spin);
                if (gfs2_should_freeze(gl)) {
                        set_bit(GLF_FROZEN, &gl->gl_flags);
                        spin_unlock(&gl->gl_spin);
                        return;
                }
-                spin_unlock(&gl->gl_spin);
        }
+        spin_unlock(&gl->gl_spin);
        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+        smp_wmb();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
@@ -1627,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
        struct task_struct *gh_owner = NULL;
-        char buffer[KSYM_SYMBOL_LEN];
        char flags_buf[32];
-        sprint_symbol(buffer, gh->gh_ip);
        if (gh->gh_owner_pid)
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
-                  state2str(gh->gh_state),
+                       state2str(gh->gh_state),
-                  hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+                       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                  gh->gh_error, 
+                       gh->gh_error,
-                  gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+                       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                  gh_owner ? gh_owner->comm : "(ended)", buffer);
+                       gh_owner ? gh_owner->comm : "(ended)",
+                       (void *)gh->gh_ip);
        return 0;
 }
@@ -1783,12 +1778,13 @@ int __init gfs2_glock_init(void)
        }
 #endif
-        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
-        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
-                                                WQ_FREEZEABLE, 0);
+                                                WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                                0);
        if (IS_ERR(gfs2_delete_workqueue)) {
                destroy_workqueue(glock_workqueue);
                return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d22..691851ceb61 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
 #define GL_ASYNC                0x00000040
 #define GL_EXACT                0x00000080
 #define GL_SKIP                 0x00000100
-#define GL_ATIME                0x00000200
 #define GL_NOCACHE              0x00000400
  
 /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
 *
 * LM_OUT_ST_MASK
 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
 * LM_OUT_CANCELED
 * The lock request was canceled.
 *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
 */
 #define LM_OUT_ST_MASK          0x00000003
 #define LM_OUT_CANCELED         0x00000008
-#define LM_OUT_ASYNC            0x00000080
+#define LM_OUT_ERROR            0x00000004
-#define LM_OUT_ERROR            0x00000100
 /*
 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
        void (*lm_unmount) (struct gfs2_sbd *sdp);
        void (*lm_withdraw) (struct gfs2_sbd *sdp);
        void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-        unsigned int (*lm_lock) (struct gfs2_glock *gl,
+        int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
-                                 unsigned int req_state, unsigned int flags);
+                        unsigned int flags);
        void (*lm_cancel) (struct gfs2_glock *gl);
        const match_table_t *lm_tokens;
 };
-#define LM_FLAG_TRY             0x00000001
-#define LM_FLAG_TRY_1CB         0x00000002
-#define LM_FLAG_NOEXP           0x00000004
-#define LM_FLAG_ANY             0x00000008
-#define LM_FLAG_PRIORITY        0x00000010
-#define GL_ASYNC                0x00000040
-#define GL_EXACT                0x00000080
-#define GL_SKIP                 0x00000100
-#define GL_NOCACHE              0x00000400
-#define GLR_TRYFAILED           13
 extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e..263561bf1a5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-                flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc..a79790c0627 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
 #define __INCORE_DOT_H__
 #include <linux/fs.h>
+#include <linux/kobject.h>
 #include <linux/workqueue.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
@@ -207,12 +208,14 @@ struct gfs2_glock {
        spinlock_t gl_spin;
-        unsigned int gl_state;
+        /* State fields protected by gl_spin */
-        unsigned int gl_target;
+        unsigned int gl_state:2,        /* Current state */
-        unsigned int gl_reply;
+                     gl_target:2,       /* Target state */
+                     gl_demote_state:2, /* State requested by remote node */
+                     gl_req:2,          /* State in last dlm request */
+                     gl_reply:8;        /* Last reply from the dlm */
        unsigned int gl_hash;
-        unsigned int gl_req;
-        unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
        struct list_head gl_holders;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8c..7aa7d4f8984 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,60 +73,15 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
        return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
-struct gfs2_skip_data {
-        u64     no_addr;
-        int     skipped;
-};
-static int iget_skip_test(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (ip->i_no_addr == data->no_addr) {
-                if (inode->i_state & (I_FREEING|I_WILL_FREE)){
-                        data->skipped = 1;
-                        return 0;
-                }
-                return 1;
-        }
-        return 0;
-}
-static int iget_skip_set(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (data->skipped)
-                return 1;
-        inode->i_ino = (unsigned long)(data->no_addr);
-        ip->i_no_addr = data->no_addr;
-        return 0;
-}
-static struct inode *gfs2_iget_skip(struct super_block *sb,
-                                    u64 no_addr)
-{
-        struct gfs2_skip_data data;
-        unsigned long hash = (unsigned long)no_addr;
-        data.no_addr = no_addr;
-        data.skipped = 0;
-        return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
-}
 /**
- * GFS2 lookup code fills in vfs inode contents based on info obtained
+ * gfs2_set_iop - Sets inode operations
- * from directory entry inside gfs2_inode_lookup(). This has caused issues
+ * @inode: The inode with correct i_mode filled in
- * with NFS code path since its get_dentry routine doesn't have the relevant
- * directory entry when gfs2_inode_lookup() is invoked. Part of the code
- * segment inside gfs2_inode_lookup code needs to get moved around.
 *
- * Clears I_NEW as well.
+ * GFS2 lookup code fills in vfs inode contents based on info obtained
- **/
+ * from directory entry inside gfs2_inode_lookup().
+ */
-void gfs2_set_iop(struct inode *inode)
+static void gfs2_set_iop(struct inode *inode)
 {
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        umode_t mode = inode->i_mode;
@@ -149,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
                inode->i_op = &gfs2_file_iops;
                init_special_inode(inode, inode->i_mode, inode->i_rdev);
        }
-        unlock_new_inode(inode);
 }
 /**
@@ -162,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
 * Returns: A VFS inode, or an error
 */
-struct inode *gfs2_inode_lookup(struct super_block *sb,
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
-                                unsigned int type,
+                                u64 no_addr, u64 no_formal_ino)
-                                u64 no_addr,
-                                u64 no_formal_ino)
 {
        struct inode *inode;
        struct gfs2_inode *ip;
@@ -195,141 +146,80 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
                error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
                if (unlikely(error))
                        goto fail_iopen;
-                ip->i_iopen_gh.gh_gl->gl_object = ip;
+                ip->i_iopen_gh.gh_gl->gl_object = ip;
                gfs2_glock_put(io_gl);
                io_gl = NULL;
-                if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
-                        goto gfs2_nfsbypass;
-                inode->i_mode = DT2IF(type);
-                /*
-                 * We must read the inode in order to work out its type in
-                 * this case. Note that this doesn't happen often as we normally
-                 * know the type beforehand. This code path only occurs during
-                 * unlinked inode recovery (where it is safe to do this glock,
-                 * which is not true in the general case).
-                 */
                if (type == DT_UNKNOWN) {
-                        struct gfs2_holder gh;
+                        /* Inode glock must be locked already */
-                        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+                        error = gfs2_inode_refresh(GFS2_I(inode));
-                        if (unlikely(error))
+                        if (error)
-                                goto fail_glock;
+                                goto fail_refresh;
-                        /* Inode is now uptodate */
+                } else {
-                        gfs2_glock_dq_uninit(&gh);
+                        inode->i_mode = DT2IF(type);
                }
                gfs2_set_iop(inode);
+                unlock_new_inode(inode);
        }
-gfs2_nfsbypass:
        return inode;
-fail_glock:
-        gfs2_glock_dq(&ip->i_iopen_gh);
+fail_refresh:
+        ip->i_iopen_gh.gh_gl->gl_object = NULL;
+        gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_iopen:
        if (io_gl)
                gfs2_glock_put(io_gl);
 fail_put:
-        if (inode->i_state & I_NEW)
+        ip->i_gl->gl_object = NULL;
-                ip->i_gl->gl_object = NULL;
        gfs2_glock_put(ip->i_gl);
 fail:
-        if (inode->i_state & I_NEW)
+        iget_failed(inode);
-                iget_failed(inode);
-        else
-                iput(inode);
        return ERR_PTR(error);
 }
-/**
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+                                  u64 *no_formal_ino, unsigned int blktype)
- *                               and try to reclaim it by doing iput.
- *
- * This function assumes no rgrp locks are currently held.
- *
- * @sb: The super block
- * no_addr: The inode number
- *
- */
-void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
-        struct gfs2_sbd *sdp;
+        struct super_block *sb = sdp->sd_vfs;
-        struct gfs2_inode *ip;
+        struct gfs2_holder i_gh;
-        struct gfs2_glock *io_gl = NULL;
-        int error;
-        struct gfs2_holder gh;
        struct inode *inode;
+        int error;
-        inode = gfs2_iget_skip(sb, no_addr);
+        error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-        if (!inode)
+        if (error)
-                return;
+                return ERR_PTR(error);
-        /* If it's not a new inode, someone's using it, so leave it alone. */
-        if (!(inode->i_state & I_NEW)) {
-                iput(inode);
-                return;
-        }
-        ip = GFS2_I(inode);
-        sdp = GFS2_SB(inode);
-        ip->i_no_formal_ino = -1;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+        error = gfs2_check_blk_type(sdp, no_addr, blktype);
-        if (unlikely(error))
+        if (error)
                goto fail;
-        ip->i_gl->gl_object = ip;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
-        if (unlikely(error))
-                goto fail_put;
-        set_bit(GIF_INVALID, &ip->i_flags);
-        error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
-                                   &ip->i_iopen_gh);
-        if (unlikely(error))
-                goto fail_iopen;
-        ip->i_iopen_gh.gh_gl->gl_object = ip;
-        gfs2_glock_put(io_gl);
-        io_gl = NULL;
-        inode->i_mode = DT2IF(DT_UNKNOWN);
+        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
+        if (IS_ERR(inode))
+                goto fail;
-        /*
+        /* Two extra checks for NFS only */
-         * We must read the inode in order to work out its type in
+        if (no_formal_ino) {
-         * this case. Note that this doesn't happen often as we normally
+                error = -ESTALE;
-         * know the type beforehand. This code path only occurs during
+                if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
-         * unlinked inode recovery (where it is safe to do this glock,
+                        goto fail_iput;
-         * which is not true in the general case).
-         */
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
-                                   &gh);
-        if (unlikely(error))
-                goto fail_glock;
-        /* Inode is now uptodate */
+                error = -EIO;
-        gfs2_glock_dq_uninit(&gh);
+                if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
-        gfs2_set_iop(inode);
+                        goto fail_iput;
-        /* The iput will cause it to be deleted. */
+                error = 0;
-        iput(inode);
+        }
-        return;
-fail_glock:
-        gfs2_glock_dq(&ip->i_iopen_gh);
-fail_iopen:
-        if (io_gl)
-                gfs2_glock_put(io_gl);
-fail_put:
-        ip->i_gl->gl_object = NULL;
-        gfs2_glock_put(ip->i_gl);
 fail:
-        iget_failed(inode);
+        gfs2_glock_dq_uninit(&i_gh);
-        return;
+        return error ? ERR_PTR(error) : inode;
+fail_iput:
+        iput(inode);
+        goto fail;
 }
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -591,7 +481,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        }
        if (!is_root) {
-                error = gfs2_permission(dir, MAY_EXEC);
+                error = gfs2_permission(dir, MAY_EXEC, 0);
                if (error)
                        goto out;
        }
@@ -621,7 +511,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
        int error;
-        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                return error;
@@ -998,17 +888,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
        if (error)
                return error;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
-        gfs2_assert_warn(GFS2_SB(inode), !error);
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc..3e00a66e7cb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,10 +96,11 @@ err:
        return -EIO;
 }
-extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                       u64 no_addr, u64 no_formal_ino);
-extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+                                         u64 *no_formal_ino,
+                                         unsigned int blktype);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
@@ -111,7 +112,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
                                  const struct qstr *name,
                                  unsigned int mode, dev_t dev);
-extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45f..6e493aee28f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
        return lkf;
 }
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
-                              unsigned int req_state, unsigned int flags)
+                     unsigned int flags)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-        int error;
        int req;
        u32 lkf;
-        gl->gl_req = req_state;
        req = make_mode(req_state);
        lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
         * Submit the actual lock request.
         */
-        error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+        return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-                         GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-        if (error == -EAGAIN)
-                return 0;
-        if (error)
-                return LM_OUT_ERROR;
-        return LM_OUT_ASYNC;
 }
 static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b8..777927ce6f7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
                iput(inode);
                return -ENOMEM;
        }
-        dentry->d_op = &gfs2_dops;
        *dptr = dentry;
        return 0;
 }
@@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
        sb->s_magic = GFS2_MAGIC;
        sb->s_op = &gfs2_super_ops;
+        sb->s_d_op = &gfs2_dops;
        sb->s_export_op = &gfs2_export_ops;
        sb->s_xattr = gfs2_xattr_handlers;
        sb->s_qcop = &gfs2_quotactl_ops;
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 {
        struct block_device *bdev;
        struct super_block *s;
-        fmode_t mode = FMODE_READ;
+        fmode_t mode = FMODE_READ | FMODE_EXCL;
        int error;
        struct gfs2_args args;
        struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
-        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+        bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
                goto error_bdev;
        if (s->s_root)
-                close_bdev_exclusive(bdev, mode);
+                blkdev_put(bdev, mode);
        memset(&args, 0, sizeof(args));
        args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
        deactivate_locked_super(s);
        return ERR_PTR(error);
 error_bdev:
-        close_bdev_exclusive(bdev, mode);
+        blkdev_put(bdev, mode);
        return ERR_PTR(error);
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c..d8b26ac2e20 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
-#include <linux/swap.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include "gfs2.h"
@@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
 {
        struct inode *inode = NULL;
-        dentry->d_op = &gfs2_dops;
        inode = gfs2_lookupi(dir, &dentry->d_name, 0);
        if (inode && IS_ERR(inode))
                return ERR_CAST(inode);
@@ -166,7 +162,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (error)
                goto out_child;
-        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                goto out_gunlock;
@@ -289,7 +285,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
-        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
        if (error)
                return error;
@@ -822,7 +818,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        }
                }
        } else {
-                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
                if (error)
                        goto out_gunlock;
@@ -857,7 +853,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        /* Check out the dir to be renamed */
        if (dir_rename) {
-                error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+                error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
                if (error)
                        goto out_gunlock;
        }
@@ -1041,13 +1037,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 * Returns: errno
 */
-int gfs2_permission(struct inode *inode, int mask)
+int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_inode *ip;
        struct gfs2_holder i_gh;
        int error;
        int unlock = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        ip = GFS2_I(inode);
        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
@@ -1058,7 +1058,7 @@ int gfs2_permission(struct inode *inode, int mask)
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
                error = -EACCES;
        else
-                error = generic_permission(inode, mask, gfs2_check_acl);
+                error = generic_permission(inode, mask, flags, gfs2_check_acl);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        struct buffer_head *dibh;
        u32 ouid, ogid, nuid, ngid;
        int error;
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (error)
                goto out_gunlock_q;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+        error = gfs2_setattr_simple(ip, attr);
        if (error)
                goto out_end_trans;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                int error;
-                error = vmtruncate(inode, attr->ia_size);
-                gfs2_assert_warn(sdp, !error);
-        }
-        setattr_copy(inode, attr);
-        mark_inode_dirty(inode);
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        gfs2_dinode_out(ip, dibh->b_data);
-        brelse(dibh);
        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
                gfs2_quota_change(ip, -blocks, ouid, ogid);
@@ -1271,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
        return ret;
 }
-static void empty_write_end(struct page *page, unsigned from,
-                           unsigned to)
-{
-        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-        page_zero_new_buffers(page, from, to);
-        flush_dcache_page(page);
-        mark_page_accessed(page);
-        if (!gfs2_is_writeback(ip))
-                gfs2_page_add_databufs(ip, page, from, to);
-        block_commit_write(page, from, to);
-}
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
-{
-        unsigned start, end, next;
-        struct buffer_head *bh, *head;
-        int error;
-        if (!page_has_buffers(page)) {
-                error = __block_write_begin(page, from, to - from, gfs2_block_map);
-                if (unlikely(error))
-                        return error;
-                empty_write_end(page, from, to);
-                return 0;
-        }
-        bh = head = page_buffers(page);
-        next = end = 0;
-        while (next < from) {
-                next += bh->b_size;
-                bh = bh->b_this_page;
-        }
-        start = next;
-        do {
-                next += bh->b_size;
-                if (buffer_mapped(bh)) {
-                        if (end) {
-                                error = __block_write_begin(page, start, end - start,
-                                                            gfs2_block_map);
-                                if (unlikely(error))
-                                        return error;
-                                empty_write_end(page, start, end);
-                                end = 0;
-                        }
-                        start = next;
-                }
-                else
-                        end = next;
-                bh = bh->b_this_page;
-        } while (next < to);
-        if (end) {
-                error = __block_write_begin(page, start, end - start, gfs2_block_map);
-                if (unlikely(error))
-                        return error;
-                empty_write_end(page, start, end);
-        }
-        return 0;
-}
-static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
-                           int mode)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct buffer_head *dibh;
-        int error;
-        u64 start = offset >> PAGE_CACHE_SHIFT;
-        unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
-        u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
-        pgoff_t curr;
-        struct page *page;
-        unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
-        unsigned int from, to;
-        if (!end_offset)
-                end_offset = PAGE_CACHE_SIZE;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
-        if (unlikely(error))
-                goto out;
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        if (gfs2_is_stuffed(ip)) {
-                error = gfs2_unstuff_dinode(ip, NULL);
-                if (unlikely(error))
-                        goto out;
-        }
-        curr = start;
-        offset = start << PAGE_CACHE_SHIFT;
-        from = start_offset;
-        to = PAGE_CACHE_SIZE;
-        while (curr <= end) {
-                page = grab_cache_page_write_begin(inode->i_mapping, curr,
-                                                   AOP_FLAG_NOFS);
-                if (unlikely(!page)) {
-                        error = -ENOMEM;
-                        goto out;
-                }
-                if (curr == end)
-                        to = end_offset;
-                error = write_empty_blocks(page, from, to);
-                if (!error && offset + to > inode->i_size &&
-                    !(mode & FALLOC_FL_KEEP_SIZE)) {
-                        i_size_write(inode, offset + to);
-                }
-                unlock_page(page);
-                page_cache_release(page);
-                if (error)
-                        goto out;
-                curr++;
-                offset += PAGE_CACHE_SIZE;
-                from = 0;
-        }
-        gfs2_dinode_out(ip, dibh->b_data);
-        mark_inode_dirty(inode);
-        brelse(dibh);
-out:
-        return error;
-}
-static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
-                            unsigned int *data_blocks, unsigned int *ind_blocks)
-{
-        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
-        unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
-        for (tmp = max_data; tmp > sdp->sd_diptrs;) {
-                tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
-                max_data -= tmp;
-        }
-        /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
-           so it might end up with fewer data blocks */
-        if (max_data <= *data_blocks)
-                return;
-        *data_blocks = max_data;
-        *ind_blocks = max_blocks - max_data;
-        *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
-        if (*len > max) {
-                *len = max;
-                gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
-        }
-}
-static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
-                           loff_t len)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        struct gfs2_inode *ip = GFS2_I(inode);
-        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-        loff_t bytes, max_bytes;
-        struct gfs2_alloc *al;
-        int error;
-        loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
-        next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
-        offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
-                 sdp->sd_sb.sb_bsize_shift;
-        len = next - offset;
-        bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
-        if (!bytes)
-                bytes = UINT_MAX;
-        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-        error = gfs2_glock_nq(&ip->i_gh);
-        if (unlikely(error))
-                goto out_uninit;
-        if (!gfs2_write_alloc_required(ip, offset, len))
-                goto out_unlock;
-        while (len > 0) {
-                if (len < bytes)
-                        bytes = len;
-                al = gfs2_alloc_get(ip);
-                if (!al) {
-                        error = -ENOMEM;
-                        goto out_unlock;
-                }
-                error = gfs2_quota_lock_check(ip);
-                if (error)
-                        goto out_alloc_put;
-retry:
-                gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
-                al->al_requested = data_blocks + ind_blocks;
-                error = gfs2_inplace_reserve(ip);
-                if (error) {
-                        if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
-                                bytes >>= 1;
-                                goto retry;
-                        }
-                        goto out_qunlock;
-                }
-                max_bytes = bytes;
-                calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
-                al->al_requested = data_blocks + ind_blocks;
-                rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
-                          RES_RG_HDR + gfs2_rg_blocks(al);
-                if (gfs2_is_jdata(ip))
-                        rblocks += data_blocks ? data_blocks : 1;
-                error = gfs2_trans_begin(sdp, rblocks,
-                                         PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
-                if (error)
-                        goto out_trans_fail;
-                error = fallocate_chunk(inode, offset, max_bytes, mode);
-                gfs2_trans_end(sdp);
-                if (error)
-                        goto out_trans_fail;
-                len -= max_bytes;
-                offset += max_bytes;
-                gfs2_inplace_release(ip);
-                gfs2_quota_unlock(ip);
-                gfs2_alloc_put(ip);
-        }
-        goto out_unlock;
-out_trans_fail:
-        gfs2_inplace_release(ip);
-out_qunlock:
-        gfs2_quota_unlock(ip);
-out_alloc_put:
-        gfs2_alloc_put(ip);
-out_unlock:
-        gfs2_glock_dq(&ip->i_gh);
-out_uninit:
-        gfs2_holder_uninit(&ip->i_gh);
-        return error;
-}
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                       u64 start, u64 len)
 {
@@ -1572,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
        .getxattr = gfs2_getxattr,
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
-        .fallocate = gfs2_fallocate,
        .fiemap = gfs2_fiemap,
 };
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b4..a689901963d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                             struct fs_disk_quota *fdq)
 {
        struct inode *inode = &ip->i_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct address_space *mapping = inode->i_mapping;
        unsigned long index = loc >> PAGE_CACHE_SHIFT;
        unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        qd->qd_qb.qb_value = qp->qu_value;
        if (fdq) {
                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_warn = qp->qu_warn;
                }
                if (fdq->d_fieldmask & FS_DQ_BHARD) {
-                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_limit = qp->qu_limit;
                }
+                if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                        qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                        qd->qd_qb.qb_value = qp->qu_value;
+                }
        }
        /* Write the quota into the quota file on disk */
@@ -1497,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
        fdq->d_version = FS_DQUOT_VERSION;
        fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
        fdq->d_id = id;
-        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
+        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
-        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
+        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
-        fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+        fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
        gfs2_glock_dq_uninit(&q_gh);
 out:
@@ -1508,7 +1513,7 @@ out:
 }
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
 static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                          struct fs_disk_quota *fdq)
@@ -1566,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        /* If nothing has changed, this is a no-op */
        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
-            (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                fdq->d_fieldmask ^= FS_DQ_BSOFT;
        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
-            (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                fdq->d_fieldmask ^= FS_DQ_BHARD;
+        if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+            ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+                fdq->d_fieldmask ^= FS_DQ_BCOUNT;
        if (fdq->d_fieldmask == 0)
                goto out_i;
@@ -1619,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
        .get_dqblk      = gfs2_get_dqblk,
        .set_dqblk      = gfs2_set_dqblk,
 };
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c..7293ea27020 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
-                if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+                if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 * Returns: 0 on successful update, error code otherwise
 */
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 }
 /**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct inode *inode = &ip->i_inode;
-        struct file_ra_state ra_state;
-        struct gfs2_rgrpd *rgd;
-        unsigned int max_data = 0;
-        int error;
-        file_ra_state_init(&ra_state, inode->i_mapping);
-        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-                /* Ignore partials */
-                if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                    i_size_read(inode))
-                        break;
-                error = read_rindex_entry(ip, &ra_state);
-                if (error) {
-                        clear_rgrpdi(sdp);
-                        return error;
-                }
-        }
-        list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-                if (rgd->rd_data > max_data)
-                        max_data = rgd->rd_data;
-        sdp->sd_max_rg_data = max_data;
-        sdp->sd_rindex_uptodate = 1;
-        return 0;
-}
-/**
 * gfs2_rindex_hold - Grab a lock on the rindex
 * @sdp: The GFS2 superblock
 * @ri_gh: the glock holder
@@ -963,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 *          The inode, if one has been found, in inode.
 */
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
-                           u64 skip)
 {
        u32 goal = 0, block;
        u64 no_addr;
        struct gfs2_sbd *sdp = rgd->rd_sbd;
        unsigned int n;
+        struct gfs2_glock *gl;
+        struct gfs2_inode *ip;
+        int error;
+        int found = 0;
-        for(;;) {
+        while (goal < rgd->rd_data) {
-                if (goal >= rgd->rd_data)
-                        break;
                down_write(&sdp->sd_log_flush_lock);
                n = 1;
                block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
                if (no_addr == skip)
                        continue;
                *last_unlinked = no_addr;
-                return no_addr;
+                error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+                if (error)
+                        continue;
+                /* If the inode is already in cache, we can ignore it here
+                 * because the existing inode disposal code will deal with
+                 * it when all refs have gone away. Accessing gl_object like
+                 * this is not safe in general. Here it is ok because we do
+                 * not dereference the pointer, and we only need an approx
+                 * answer to whether it is NULL or not.
+                 */
+                ip = gl->gl_object;
+                if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+                        gfs2_glock_put(gl);
+                else
+                        found++;
+                /* Limit reclaim to sensible number of tasks */
+                if (found > 2*NR_CPUS)
+                        return;
        }
        rgd->rd_flags &= ~GFS2_RDF_CHECK;
-        return 0;
+        return;
 }
 /**
@@ -1075,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
 * Try to acquire rgrp in way which avoids contending with others.
 *
 * Returns: errno
- *          unlinked: the block address of an unlinked block to be reclaimed
 */
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
-                          u64 *last_unlinked)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
        int loops = 0;
        int error, rg_locked;
-        *unlinked = 0;
        rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
        while (rgd) {
@@ -1106,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        /* If the rg came in already locked, there's no
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                           way we can recover from a failed try_rgrp_unlink
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                           because that would require an iput which can only
-                           happen after the rgrp is unlocked. */
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
-                                                           ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        /* fall through */
                case GLR_TRYFAILED:
                        rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                                                            ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        break;
                case GLR_TRYFAILED:
@@ -1204,12 +1173,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
        int error = 0;
-        u64 last_unlinked = NO_BLOCK, unlinked;
+        u64 last_unlinked = NO_BLOCK;
+        int tries = 0;
        if (gfs2_assert_warn(sdp, al->al_requested))
                return -EINVAL;
-try_again:
        if (hold_rindex) {
                /* We need to hold the rindex unless the inode we're using is
                   the rindex itself, in which case it's already held. */
@@ -1217,32 +1186,33 @@ try_again:
                        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
-                        error = gfs2_ri_update_special(ip);
+                        error = gfs2_ri_update(ip);
+                if (error)
+                        return error;
        }
-        if (error)
+try_again:
-                return error;
+        do {
+                error = get_local_rgrp(ip, &last_unlinked);
+                /* If there is no space, flushing the log may release some */
+                if (error) {
+                        if (ip == GFS2_I(sdp->sd_rindex) &&
+                            !sdp->sd_rindex_uptodate) {
+                                error = gfs2_ri_update(ip);
+                                if (error)
+                                        return error;
+                                goto try_again;
+                        }
+                        gfs2_log_flush(sdp, NULL);
+                }
+        } while (error && tries++ < 3);
-        /* Find an rgrp suitable for allocation.  If it encounters any unlinked
-           dinodes along the way, error will equal -EAGAIN and unlinked will
-           contains it block address. We then need to look up that inode and
-           try to free it, and try the allocation again. */
-        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
                if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
-                if (error != -EAGAIN)
+                return error;
-                        return error;
-                gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-                /* regardless of whether or not gfs2_process_unlinked_inode
-                   was successful, we don't want to repeat it again. */
-                last_unlinked = unlinked;
-                gfs2_log_flush(sdp, NULL);
-                error = 0;
-                goto try_again;
        }
        /* no error, so we have the rgrp set in the inode's allocation. */
        al->al_file = file;
        al->al_line = line;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9..50c2bb04369 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2b2c4997430..ec73ed70bae 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
        if (error)
                goto out_truncate;
+        ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
        gfs2_glock_dq_wait(&ip->i_iopen_gh);
        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
        error = gfs2_glock_nq(&ip->i_iopen_gh);
@@ -1405,11 +1406,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
        return &ip->i_inode;
 }
-static void gfs2_destroy_inode(struct inode *inode)
+static void gfs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(gfs2_inode_cachep, inode);
 }
+static void gfs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
 const struct super_operations gfs2_super_ops = {
        .alloc_inode            = gfs2_alloc_inode,
        .destroy_inode          = gfs2_destroy_inode,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a..439b61c0326 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-        struct inode *inode = &ip->i_inode;
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_ea_location el;
-        struct buffer_head *dibh;
        int error;
        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
        if (error)
                return error;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+        error = gfs2_setattr_simple(ip, attr);
-        if (error)
-                goto out_trans_end;
-        if ((attr->ia_valid & ATTR_SIZE) &&
-            attr->ia_size != i_size_read(inode)) {
-                int error;
-                error = vmtruncate(inode, attr->ia_size);
-                gfs2_assert_warn(GFS2_SB(inode), !error);
-        }
-        setattr_copy(inode, attr);
-        mark_inode_dirty(inode);
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        gfs2_dinode_out(ip, dibh->b_data);
-        brelse(dibh);
-out_trans_end:
        gfs2_trans_end(sdp);
        return error;
 }
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41..afa66aaa223 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
        struct inode *inode = NULL;
        int res;
-        dentry->d_op = &hfs_dentry_operations;
        hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
        hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
        res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index c8cffb81e84..ad97c2d5828 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
-extern int hfs_hash_dentry(struct dentry *, struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
+                struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
                      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+extern int hfs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 /* trans.c */
 extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af7942..495a976a3cc 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
 /*
 * Hash a string to an integer in a case-independent way
 */
-int hfs_hash_dentry(struct dentry *dentry, struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *this)
 {
        const unsigned char *name = this->name;
        unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
 * Test for equality of two strings in the HFS filename character ordering.
 * return 1 on failure and 0 on success
 */
-int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        const unsigned char *n1, *n2;
-        int len;
-        len = s1->len;
        if (len >= HFS_NAMELEN) {
-                if (s2->len < HFS_NAMELEN)
+                if (name->len < HFS_NAMELEN)
                        return 1;
                len = HFS_NAMELEN;
-        } else if (len != s2->len)
+        } else if (len != name->len)
                return 1;
-        n1 = s1->name;
+        n1 = str;
-        n2 = s2->name;
+        n2 = name->name;
        while (len--) {
                if (caseorder[*n1++] != caseorder[*n2++])
                        return 1;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4824c27cebb..1b55f704fb2 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
        return i ? &i->vfs_inode : NULL;
 }
-static void hfs_destroy_inode(struct inode *inode)
+static void hfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
+static void hfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hfs_i_callback);
+}
 static const struct super_operations hfs_super_operations = {
        .alloc_inode    = hfs_alloc_inode,
        .destroy_inode  = hfs_destroy_inode,
@@ -422,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
        if (!root_inode)
                goto bail_no_root;
+        sb->s_d_op = &hfs_dentry_operations;
        res = -ENOMEM;
        sb->s_root = d_alloc_root(root_inode);
        if (!sb->s_root)
                goto bail_iput;
-        sb->s_root->d_op = &hfs_dentry_operations;
        /* everything's okay */
        return 0;
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219a..19cf291eb91 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
 * This file contains the code to do various system dependent things.
 */
+#include <linux/namei.h>
 #include "hfs_fs.h"
 /* dentry case-handling: just lowercase everything */
 static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        int diff;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
        if(!inode)
                return 1;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index d182438c7ae..5d799c13205 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
                return -ENOMEM;
        fd->search_key = ptr;
        fd->key = ptr + tree->max_key_len + 2;
-        dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
+        dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+                tree->cnid, __builtin_return_address(0));
        mutex_lock(&tree->tree_lock);
        return 0;
 }
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
        hfs_bnode_put(fd->bnode);
        kfree(fd->search_key);
-        dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
+        dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+                fd->tree->cnid, __builtin_return_address(0));
        mutex_unlock(&fd->tree->tree_lock);
        fd->tree = NULL;
 }
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ad57f5991eb..1cad80c789c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,7 +15,8 @@
 #define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8)
-int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+                u32 offset, u32 *max)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
        struct page *page;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba7..1c42cc5b899 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 {
        __be16 data;
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_read(node, &data, off, 2);
        return be16_to_cpu(data);
 }
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 {
        u8 data;
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_read(node, &data, off, 1);
        return data;
 }
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
 {
        __be16 v = cpu_to_be16(data);
-        // optimize later...
+        /* TODO: optimize later... */
        hfs_bnode_write(node, &v, off, 2);
 }
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                                dst_page--;
                        }
                        src -= len;
-                        memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len);
+                        memmove(kmap(*dst_page) + src,
+                                kmap(*src_page) + src, len);
                        kunmap(*src_page);
                        set_page_dirty(*dst_page);
                        kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                if (src == dst) {
                        l = min(len, (int)PAGE_CACHE_SIZE - src);
-                        memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+                        memmove(kmap(*dst_page) + src,
+                                kmap(*src_page) + src, l);
                        kunmap(*src_page);
                        set_page_dirty(*dst_page);
                        kunmap(*dst_page);
                        while ((len -= l) != 0) {
                                l = min(len, (int)PAGE_CACHE_SIZE);
-                                memmove(kmap(*++dst_page), kmap(*++src_page), l);
+                                memmove(kmap(*++dst_page),
+                                        kmap(*++src_page), l);
                                kunmap(*src_page);
                                set_page_dirty(*dst_page);
                                kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
                        do {
                                src_ptr = kmap(*src_page) + src;
                                dst_ptr = kmap(*dst_page) + dst;
-                                if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+                                if (PAGE_CACHE_SIZE - src <
+                                                PAGE_CACHE_SIZE - dst) {
                                        l = PAGE_CACHE_SIZE - src;
                                        src = 0;
                                        dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
                        return;
                tmp->next = node->next;
                cnid = cpu_to_be32(tmp->next);
-                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+                hfs_bnode_write(tmp, &cnid,
+                        offsetof(struct hfs_bnode_desc, next), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_head = node->next;
@@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
                        return;
                tmp->prev = node->prev;
                cnid = cpu_to_be32(tmp->prev);
-                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+                hfs_bnode_write(tmp, &cnid,
+                        offsetof(struct hfs_bnode_desc, prev), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_tail = node->prev;
-        // move down?
+        /* move down? */
-        if (!node->prev && !node->next) {
+        if (!node->prev && !node->next)
-                printk(KERN_DEBUG "hfs_btree_del_level\n");
+                dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
-        }
        if (!node->parent) {
                tree->root = 0;
                tree->depth = 0;
@@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
        struct hfs_bnode *node;
        if (cnid >= tree->node_count) {
-                printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+                printk(KERN_ERR "hfs: request for non-existent node "
+                                "%d in B*Tree\n",
+                        cnid);
                return NULL;
        }
        for (node = tree->node_hash[hfs_bnode_hash(cnid)];
-             node; node = node->next_hash) {
+                        node; node = node->next_hash)
-                if (node->this == cnid) {
+                if (node->this == cnid)
                        return node;
-                }
-        }
        return NULL;
 }
@@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        loff_t off;
        if (cnid >= tree->node_count) {
-                printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+                printk(KERN_ERR "hfs: request for non-existent node "
+                                "%d in B*Tree\n",
+                        cnid);
                return NULL;
        }
@@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        } else {
                spin_unlock(&tree->hash_lock);
                kfree(node);
-                wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+                wait_event(node2->lock_wq,
+                        !test_bit(HFS_BNODE_NEW, &node2->flags));
                return node2;
        }
        spin_unlock(&tree->hash_lock);
@@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
        if (node) {
                hfs_bnode_get(node);
                spin_unlock(&tree->hash_lock);
-                wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+                wait_event(node->lock_wq,
+                        !test_bit(HFS_BNODE_NEW, &node->flags));
                if (test_bit(HFS_BNODE_ERROR, &node->flags))
                        goto node_error;
                return node;
@@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
        if (!test_bit(HFS_BNODE_NEW, &node->flags))
                return node;
-        desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+        desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+                        node->page_offset);
        node->prev = be32_to_cpu(desc->prev);
        node->next = be32_to_cpu(desc->next);
        node->num_recs = be16_to_cpu(desc->num_recs);
@@ -556,11 +566,13 @@ node_error:
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-        //int i;
+#if 0
+        int i;
-        //for (i = 0; i < node->tree->pages_per_bnode; i++)
+        for (i = 0; i < node->tree->pages_per_bnode; i++)
-        //      if (node->page[i])
+                if (node->page[i])
-        //              page_cache_release(node->page[i]);
+                        page_cache_release(node->page[i]);
+#endif
        kfree(node);
 }
@@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
        if (node) {
                atomic_inc(&node->refcnt);
                dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                        node->tree->cnid, node->this,
+                        atomic_read(&node->refcnt));
        }
 }
@@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
                int i;
                dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                        node->tree->cnid, node->this,
+                        atomic_read(&node->refcnt));
                BUG_ON(!atomic_read(&node->refcnt));
                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
                        return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2f39d05443e..2312de34bd4 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,7 +39,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
           !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
                retval = node->tree->max_key_len + 2;
        } else {
-                recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+                recoff = hfs_bnode_read_u16(node,
+                        node->tree->node_size - (rec + 1) * 2);
                if (!recoff)
                        return 0;
@@ -84,7 +85,8 @@ again:
        end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
        end_off = hfs_bnode_read_u16(node, end_rec_off);
        end_rec_off -= 2;
-        dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+        dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+                rec, size, end_off, end_rec_off);
        if (size > end_rec_off - end_off) {
                if (new_node)
                        panic("not enough room!\n");
@@ -99,7 +101,9 @@ again:
        }
        node->num_recs++;
        /* write new last offset */
-        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+        hfs_bnode_write_u16(node,
+                offsetof(struct hfs_bnode_desc, num_recs),
+                node->num_recs);
        hfs_bnode_write_u16(node, end_rec_off, end_off + size);
        data_off = end_off;
        data_rec_off = end_rec_off + 2;
@@ -151,7 +155,8 @@ skip:
                if (tree->attributes & HFS_TREE_VARIDXKEYS)
                        key_len = be16_to_cpu(fd->search_key->key_len) + 2;
                else {
-                        fd->search_key->key_len = cpu_to_be16(tree->max_key_len);
+                        fd->search_key->key_len =
+                                cpu_to_be16(tree->max_key_len);
                        key_len = tree->max_key_len + 2;
                }
                goto again;
@@ -180,7 +185,8 @@ again:
                mark_inode_dirty(tree->inode);
        }
        hfs_bnode_dump(node);
-        dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+        dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+                fd->record, fd->keylength + fd->entrylength);
        if (!--node->num_recs) {
                hfs_bnode_unlink(node);
                if (!node->parent)
@@ -194,7 +200,9 @@ again:
                __hfs_brec_find(node, fd);
                goto again;
        }
-        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+        hfs_bnode_write_u16(node,
+                offsetof(struct hfs_bnode_desc, num_recs),
+                node->num_recs);
        if (rec_off == end_off)
                goto skip;
@@ -364,7 +372,8 @@ again:
                newkeylen = hfs_bnode_read_u16(node, 14) + 2;
        else
                fd->keylength = newkeylen = tree->max_key_len + 2;
-        dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen);
+        dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
+                rec, fd->keylength, newkeylen);
        rec_off = tree->node_size - (rec + 2) * 2;
        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -375,7 +384,7 @@ again:
                end_off = hfs_bnode_read_u16(parent, end_rec_off);
                if (end_rec_off - end_off < diff) {
-                        printk(KERN_DEBUG "hfs: splitting index node...\n");
+                        dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
                        fd->bnode = parent;
                        new_node = hfs_bnode_split(fd);
                        if (IS_ERR(new_node))
@@ -383,7 +392,8 @@ again:
                        parent = fd->bnode;
                        rec = fd->record;
                        rec_off = tree->node_size - (rec + 2) * 2;
-                        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+                        end_rec_off = tree->node_size -
+                                (parent->num_recs + 1) * 2;
                }
        }
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 22e4d4e3299..21023d9f8ff 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -51,7 +51,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                goto free_inode;
        /* Load the header */
-        head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+        head = (struct hfs_btree_header_rec *)(kmap(page) +
+                sizeof(struct hfs_bnode_desc));
        tree->root = be32_to_cpu(head->root);
        tree->leaf_count = be32_to_cpu(head->leaf_count);
        tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -115,7 +116,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
        tree->node_size_shift = ffs(size) - 1;
-        tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        tree->pages_per_bnode =
+                (tree->node_size + PAGE_CACHE_SIZE - 1) >>
+                PAGE_CACHE_SHIFT;
        kunmap(page);
        page_cache_release(page);
@@ -144,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
                while ((node = tree->node_hash[i])) {
                        tree->node_hash[i] = node->next_hash;
                        if (atomic_read(&node->refcnt))
-                                printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n",
+                                printk(KERN_CRIT "hfs: node %d:%d "
-                                        node->tree->cnid, node->this, atomic_read(&node->refcnt));
+                                                "still has %d user(s)!\n",
+                                        node->tree->cnid, node->this,
+                                        atomic_read(&node->refcnt));
                        hfs_bnode_free(node);
                        tree->node_hash_cnt--;
                }
@@ -166,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
                return;
        /* Load the header */
        page = node->page[0];
-        head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+        head = (struct hfs_btree_header_rec *)(kmap(page) +
+                sizeof(struct hfs_bnode_desc));
        head->root = cpu_to_be32(tree->root);
        head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -272,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
                                                tree->free_nodes--;
                                                mark_inode_dirty(tree->inode);
                                                hfs_bnode_put(node);
-                                                return hfs_bnode_create(tree, idx);
+                                                return hfs_bnode_create(tree,
+                                                        idx);
                                        }
                                }
                        }
@@ -287,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
                kunmap(*pagep);
                nidx = node->next;
                if (!nidx) {
-                        printk(KERN_DEBUG "hfs: create new bmap node...\n");
+                        dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
                        next_node = hfs_bmap_new_bmap(node, idx);
                } else
                        next_node = hfs_bnode_find(tree, nidx);
@@ -329,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
                hfs_bnode_put(node);
                if (!i) {
                        /* panic */;
-                        printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
+                        printk(KERN_CRIT "hfs: unable to free bnode %u. "
+                                        "bmap not found!\n",
+                                node->this);
                        return;
                }
                node = hfs_bnode_find(tree, i);
@@ -337,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
                        return;
                if (node->type != HFS_NODE_MAP) {
                        /* panic */;
-                        printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
+                        printk(KERN_CRIT "hfs: invalid bmap found! "
+                                        "(%u,%d)\n",
+                                node->this, node->type);
                        hfs_bnode_put(node);
                        return;
                }
@@ -350,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
        m = 1 << (~nidx & 7);
        byte = data[off];
        if (!(byte & m)) {
-                printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
+                printk(KERN_CRIT "hfs: trying to free free bnode "
+                                "%u(%d)\n",
+                        node->this, node->type);
                kunmap(page);
                hfs_bnode_put(node);
                return;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 8af45fc5b05..b4ba1b31933 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -91,7 +91,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
                perms->dev = 0;
 }
-static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
+static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
+                u32 cnid, struct inode *inode)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
@@ -128,20 +129,32 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                if (cnid == inode->i_ino) {
                        hfsplus_cat_set_perms(inode, &file->permissions);
                        if (S_ISLNK(inode->i_mode)) {
-                                file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
+                                file->user_info.fdType =
-                                file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
+                                        cpu_to_be32(HFSP_SYMLINK_TYPE);
+                                file->user_info.fdCreator =
+                                        cpu_to_be32(HFSP_SYMLINK_CREATOR);
                        } else {
-                                file->user_info.fdType = cpu_to_be32(sbi->type);
+                                file->user_info.fdType =
-                                file->user_info.fdCreator = cpu_to_be32(sbi->creator);
+                                        cpu_to_be32(sbi->type);
+                                file->user_info.fdCreator =
+                                        cpu_to_be32(sbi->creator);
                        }
-                        if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
+                        if (HFSPLUS_FLG_IMMUTABLE &
-                                file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
+                                        (file->permissions.rootflags |
+                                        file->permissions.userflags))
+                                file->flags |=
+                                        cpu_to_be16(HFSPLUS_FILE_LOCKED);
                } else {
-                        file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
+                        file->user_info.fdType =
-                        file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
+                                cpu_to_be32(HFSP_HARDLINK_TYPE);
-                        file->user_info.fdFlags = cpu_to_be16(0x100);
+                        file->user_info.fdCreator =
-                        file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+                                cpu_to_be32(HFSP_HFSPLUS_CREATOR);
-                        file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
+                        file->user_info.fdFlags =
+                                cpu_to_be16(0x100);
+                        file->create_date =
+                                HFSPLUS_I(sbi->hidden_dir)->create_date;
+                        file->permissions.dev =
+                                cpu_to_be32(HFSPLUS_I(inode)->linkid);
                }
                return sizeof(*file);
        }
@@ -182,12 +195,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
                return -EIO;
        }
-        hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
+        hfsplus_cat_build_key_uni(fd->search_key,
-                                 &tmp.thread.nodeName);
+                be32_to_cpu(tmp.thread.parentID),
+                &tmp.thread.nodeName);
        return hfs_brec_find(fd);
 }
-int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
+int hfsplus_create_cat(u32 cnid, struct inode *dir,
+                struct qstr *str, struct inode *inode)
 {
        struct super_block *sb = dir->i_sb;
        struct hfs_find_data fd;
@@ -195,13 +210,15 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
        int entry_size;
        int err;
-        dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
+        dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
+                str->name, cnid, inode->i_nlink);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-        entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
+        entry_size = hfsplus_fill_cat_thread(sb, &entry,
+                S_ISDIR(inode->i_mode) ?
                        HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
-                        dir->i_ino, str);
+                dir->i_ino, str);
        err = hfs_brec_find(&fd);
        if (err != -ENOENT) {
                if (!err)
@@ -227,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
        dir->i_size++;
        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dir);
+        hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
        hfs_find_exit(&fd);
        return 0;
@@ -249,7 +267,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
        int err, off;
        u16 type;
-        dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+        dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
+                str ? str->name : NULL, cnid);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        if (!str) {
@@ -260,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                if (err)
                        goto out;
-                off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName);
+                off = fd.entryoffset +
+                        offsetof(struct hfsplus_cat_thread, nodeName);
                fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
-                hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2);
+                hfs_bnode_read(fd.bnode,
+                        &fd.search_key->cat.name.length, off, 2);
                len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
-                hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len);
+                hfs_bnode_read(fd.bnode,
+                        &fd.search_key->cat.name.unicode,
+                        off + 2, len);
                fd.search_key->key_len = cpu_to_be16(6 + len);
        } else
                hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -281,7 +304,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
 #endif
-                off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork);
+                off = fd.entryoffset +
+                        offsetof(struct hfsplus_cat_file, rsrc_fork);
                hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
                hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
        }
@@ -308,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
        dir->i_size--;
        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dir);
+        hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
 out:
        hfs_find_exit(&fd);
@@ -325,7 +349,8 @@ int hfsplus_rename_cat(u32 cnid,
        int entry_size, type;
        int err = 0;
-        dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
+        dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+                cnid, src_dir->i_ino, src_name->name,
                dst_dir->i_ino, dst_name->name);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
        dst_fd = src_fd;
@@ -353,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        dst_dir->i_size++;
        dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dst_dir);
        /* finally remove the old entry */
        hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -365,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        src_dir->i_size--;
        src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(src_dir);
        /* remove old thread entry */
        hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -379,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
        /* create new thread entry */
        hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
-        entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name);
+        entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
+                dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
                if (!err)
@@ -387,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid,
                goto out;
        }
        err = hfs_brec_insert(&dst_fd, &entry, entry_size);
+        hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
+        hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
 out:
        hfs_bnode_put(dst_fd.bnode);
        hfs_find_exit(&src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9d59c0571f5..4df5059c25d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
        sb = dir->i_sb;
-        dentry->d_op = &hfsplus_dentry_operations;
        dentry->d_fsdata = NULL;
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
@@ -66,11 +65,17 @@ again:
                        goto fail;
                }
                cnid = be32_to_cpu(entry.file.id);
-                if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
+                if (entry.file.user_info.fdType ==
-                    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+                                cpu_to_be32(HFSP_HARDLINK_TYPE) &&
-                    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+                                entry.file.user_info.fdCreator ==
-                     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+                                cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-                    HFSPLUS_SB(sb)->hidden_dir) {
+                                (entry.file.create_date ==
+                                        HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
+                                                create_date ||
+                                entry.file.create_date ==
+                                        HFSPLUS_I(sb->s_root->d_inode)->
+                                                create_date) &&
+                                HFSPLUS_SB(sb)->hidden_dir) {
                        struct qstr str;
                        char name[32];
@@ -83,11 +88,13 @@ again:
                                linkid = 0;
                        } else {
                                dentry->d_fsdata = (void *)(unsigned long)cnid;
-                                linkid = be32_to_cpu(entry.file.permissions.dev);
+                                linkid =
+                                        be32_to_cpu(entry.file.permissions.dev);
                                str.len = sprintf(name, "iNode%d", linkid);
                                str.name = name;
                                hfsplus_cat_build_key(sb, fd.search_key,
-                                        HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
+                                        HFSPLUS_SB(sb)->hidden_dir->i_ino,
+                                        &str);
                                goto again;
                        }
                } else if (!dentry->d_fsdata)
@@ -139,7 +146,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                filp->f_pos++;
                /* fall through */
        case 1:
-                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+                        fd.entrylength);
                if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
                        printk(KERN_ERR "hfs: bad catalog folder thread\n");
                        err = -EIO;
@@ -169,14 +177,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        err = -EIO;
                        goto out;
                }
-                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+                        fd.entrylength);
                type = be16_to_cpu(entry.type);
                len = HFSPLUS_MAX_STRLEN;
                err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
                if (err)
                        goto out;
                if (type == HFSPLUS_FOLDER) {
-                        if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
+                        if (fd.entrylength <
+                                        sizeof(struct hfsplus_cat_folder)) {
                                printk(KERN_ERR "hfs: small dir entry\n");
                                err = -EIO;
                                goto out;
@@ -202,7 +212,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        err = -EIO;
                        goto out;
                }
-        next:
+next:
                filp->f_pos++;
                if (filp->f_pos >= inode->i_size)
                        goto out;
@@ -273,7 +283,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                HFSPLUS_I(inode)->linkid = id;
                cnid = sbi->next_cnid++;
                src_dentry->d_fsdata = (void *)(unsigned long)cnid;
-                res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
+                res = hfsplus_create_cat(cnid, src_dir,
+                        &src_dentry->d_name, inode);
                if (res)
                        /* panic? */
                        goto out;
@@ -485,6 +496,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 };
 const struct file_operations hfsplus_dir_operations = {
+        .fsync          = hfsplus_file_fsync,
        .read           = generic_read_dir,
        .readdir        = hfsplus_readdir,
        .unlocked_ioctl = hfsplus_ioctl,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0c9cb1820a5..52a0bcaa7b6 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,8 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
        return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
 }
-static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+static void __hfsplus_ext_write_extent(struct inode *inode,
+                struct hfs_find_data *fd)
 {
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res;
@@ -95,24 +96,32 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
                                HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
        res = hfs_brec_find(fd);
-        if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
+        if (hip->extent_state & HFSPLUS_EXT_NEW) {
                if (res != -ENOENT)
                        return;
                hfs_brec_insert(fd, hip->cached_extents,
                                sizeof(hfsplus_extent_rec));
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
        } else {
                if (res)
                        return;
                hfs_bnode_write(fd->bnode, hip->cached_extents,
                                fd->entryoffset, fd->entrylength);
-                hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+                hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
        }
+        /*
+         * We can't just use hfsplus_mark_inode_dirty here, because we
+         * also get called from hfsplus_write_inode, which should not
+         * redirty the inode.  Instead the callers have to be careful
+         * to explicitly mark the inode dirty, too.
+         */
+        set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
 }
 static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-        if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
+        if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
                struct hfs_find_data fd;
                hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -144,18 +153,20 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
                return -ENOENT;
        if (fd->entrylength != sizeof(hfsplus_extent_rec))
                return -EIO;
-        hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec));
+        hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
+                sizeof(hfsplus_extent_rec));
        return 0;
 }
-static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
+static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
+                struct inode *inode, u32 block)
 {
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res;
        WARN_ON(!mutex_is_locked(&hip->extents_lock));
-        if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
+        if (hip->extent_state & HFSPLUS_EXT_DIRTY)
                __hfsplus_ext_write_extent(inode, fd);
        res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
@@ -164,10 +175,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
                                                HFSPLUS_TYPE_DATA);
        if (!res) {
                hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
-                hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
+                hip->cached_blocks =
+                        hfsplus_ext_block_count(hip->cached_extents);
        } else {
                hip->cached_start = hip->cached_blocks = 0;
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
        }
        return res;
 }
@@ -197,6 +209,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
        int res = -EIO;
        u32 ablock, dblock, mask;
+        int was_dirty = 0;
        int shift;
        /* Convert inode block to disk allocation block */
@@ -223,27 +236,37 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
                return -EIO;
        mutex_lock(&hip->extents_lock);
+        /*
+         * hfsplus_ext_read_extent will write out a cached extent into
+         * the extents btree.  In that case we may have to mark the inode
+         * dirty even for a pure read of an extent here.
+         */
+        was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
        res = hfsplus_ext_read_extent(inode, ablock);
-        if (!res) {
+        if (res) {
-                dblock = hfsplus_ext_find_block(hip->cached_extents,
-                                                ablock - hip->cached_start);
-        } else {
                mutex_unlock(&hip->extents_lock);
                return -EIO;
        }
+        dblock = hfsplus_ext_find_block(hip->cached_extents,
+                                        ablock - hip->cached_start);
        mutex_unlock(&hip->extents_lock);
 done:
-        dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
+        dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
+                inode->i_ino, (long long)iblock, dblock);
        mask = (1 << sbi->fs_shift) - 1;
-        map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
+        map_bh(bh_result, sb,
+                (dblock << sbi->fs_shift) + sbi->blockoffset +
+                        (iblock & mask));
        if (create) {
                set_buffer_new(bh_result);
                hip->phys_size += sb->s_blocksize;
                hip->fs_blocks++;
                inode_add_bytes(inode, sb->s_blocksize);
-                mark_inode_dirty(inode);
        }
+        if (create || was_dirty)
+                mark_inode_dirty(inode);
        return 0;
 }
@@ -326,7 +349,8 @@ found:
        }
 }
-int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type)
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+                struct hfsplus_fork_raw *fork, int type)
 {
        struct hfs_find_data fd;
        hfsplus_extent_rec ext_entry;
@@ -373,12 +397,13 @@ int hfsplus_file_extend(struct inode *inode)
        u32 start, len, goal;
        int res;
-        if (sbi->alloc_file->i_size * 8 <
+        if (sbi->total_blocks - sbi->free_blocks + 8 >
-            sbi->total_blocks - sbi->free_blocks + 8) {
+                        sbi->alloc_file->i_size * 8) {
-                // extend alloc file
+                /* extend alloc file */
-                printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+                printk(KERN_ERR "hfs: extend alloc file! "
-                                sbi->alloc_file->i_size * 8,
+                                "(%llu,%u,%u)\n",
-                                sbi->total_blocks, sbi->free_blocks);
+                        sbi->alloc_file->i_size * 8,
+                        sbi->total_blocks, sbi->free_blocks);
                return -ENOSPC;
        }
@@ -429,7 +454,7 @@ int hfsplus_file_extend(struct inode *inode)
                                         start, len);
                if (!res) {
                        hfsplus_dump_extent(hip->cached_extents);
-                        hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                        hip->extent_state |= HFSPLUS_EXT_DIRTY;
                        hip->cached_blocks += len;
                } else if (res == -ENOSPC)
                        goto insert_extent;
@@ -438,7 +463,7 @@ out:
        mutex_unlock(&hip->extents_lock);
        if (!res) {
                hip->alloc_blocks += len;
-                mark_inode_dirty(inode);
+                hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
        }
        return res;
@@ -450,7 +475,7 @@ insert_extent:
        hip->cached_extents[0].start_block = cpu_to_be32(start);
        hip->cached_extents[0].block_count = cpu_to_be32(len);
        hfsplus_dump_extent(hip->cached_extents);
-        hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+        hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
        hip->cached_start = hip->alloc_blocks;
        hip->cached_blocks = len;
@@ -466,8 +491,9 @@ void hfsplus_file_truncate(struct inode *inode)
        u32 alloc_cnt, blk_cnt, start;
        int res;
-        dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+        dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
-                inode->i_ino, (long long)hip->phys_size, inode->i_size);
+                inode->i_ino, (long long)hip->phys_size,
+                inode->i_size);
        if (inode->i_size > hip->phys_size) {
                struct address_space *mapping = inode->i_mapping;
@@ -481,7 +507,8 @@ void hfsplus_file_truncate(struct inode *inode)
                                                &page, &fsdata);
                if (res)
                        return;
-                res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+                res = pagecache_write_end(NULL, mapping, size,
+                        0, 0, page, fsdata);
                if (res < 0)
                        return;
                mark_inode_dirty(inode);
@@ -513,12 +540,12 @@ void hfsplus_file_truncate(struct inode *inode)
                                     alloc_cnt - start, alloc_cnt - blk_cnt);
                hfsplus_dump_extent(hip->cached_extents);
                if (blk_cnt > start) {
-                        hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                        hip->extent_state |= HFSPLUS_EXT_DIRTY;
                        break;
                }
                alloc_cnt = start;
                hip->cached_start = hip->cached_blocks = 0;
-                hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+                hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
                hfs_brec_remove(&fd);
        }
        hfs_find_exit(&fd);
@@ -527,7 +554,8 @@ void hfsplus_file_truncate(struct inode *inode)
        hip->alloc_blocks = blk_cnt;
 out:
        hip->phys_size = inode->i_size;
-        hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+        hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
+                sb->s_blocksize_bits;
        inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
-        mark_inode_dirty(inode);
+        hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index cb3653efb57..d6857523336 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,13 +23,16 @@
 #define DBG_EXTENT      0x00000020
 #define DBG_BITMAP      0x00000040
-//#define DBG_MASK      (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
+#if 0
-//#define DBG_MASK      (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK        (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
-//#define DBG_MASK      (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#define DBG_MASK        (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK        (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#endif
 #define DBG_MASK        (0)
 #define dprint(flg, fmt, args...) \
-        if (flg & DBG_MASK) printk(fmt , ## args)
+        if (flg & DBG_MASK) \
+                printk(fmt , ## args)
 /* Runtime config options */
 #define HFSPLUS_DEF_CR_TYPE    0x3F3F3F3F  /* '????' */
@@ -37,7 +40,8 @@
 #define HFSPLUS_TYPE_DATA 0x00
 #define HFSPLUS_TYPE_RSRC 0xFF
-typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *);
+typedef int (*btree_keycmp)(const hfsplus_btree_key *,
+                const hfsplus_btree_key *);
 #define NODE_HASH_SIZE  256
@@ -61,7 +65,6 @@ struct hfs_btree {
        unsigned int max_key_len;
        unsigned int depth;
-        //unsigned int map1_size, map_size;
        struct mutex tree_lock;
        unsigned int pages_per_bnode;
@@ -107,8 +110,8 @@ struct hfsplus_vh;
 struct hfs_btree;
 struct hfsplus_sb_info {
-        struct buffer_head *s_vhbh;
        struct hfsplus_vh *s_vhdr;
+        struct hfsplus_vh *s_backup_vhdr;
        struct hfs_btree *ext_tree;
        struct hfs_btree *cat_tree;
        struct hfs_btree *attr_tree;
@@ -118,7 +121,8 @@ struct hfsplus_sb_info {
        /* Runtime variables */
        u32 blockoffset;
-        u32 sect_count;
+        sector_t part_start;
+        sector_t sect_count;
        int fs_shift;
        /* immutable data from the volume header */
@@ -155,6 +159,12 @@ struct hfsplus_sb_info {
 #define HFSPLUS_SB_FORCE        2
 #define HFSPLUS_SB_HFSX         3
 #define HFSPLUS_SB_CASEFOLD     4
+#define HFSPLUS_SB_NOBARRIER    5
+static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
 struct hfsplus_inode_info {
@@ -170,7 +180,7 @@ struct hfsplus_inode_info {
        u32 cached_blocks;
        hfsplus_extent_rec first_extents;
        hfsplus_extent_rec cached_extents;
-        unsigned long flags;
+        unsigned int extent_state;
        struct mutex extents_lock;
        /*
@@ -185,6 +195,11 @@ struct hfsplus_inode_info {
        u32 linkid;
        /*
+         * Accessed using atomic bitops.
+         */
+        unsigned long flags;
+        /*
         * Protected by i_mutex.
         */
        sector_t fs_blocks;
@@ -195,12 +210,34 @@ struct hfsplus_inode_info {
        struct inode vfs_inode;
 };
-#define HFSPLUS_FLG_RSRC        0x0001
+#define HFSPLUS_EXT_DIRTY       0x0001
-#define HFSPLUS_FLG_EXT_DIRTY   0x0002
+#define HFSPLUS_EXT_NEW         0x0002
-#define HFSPLUS_FLG_EXT_NEW     0x0004
+#define HFSPLUS_I_RSRC          0       /* represents a resource fork */
+#define HFSPLUS_I_CAT_DIRTY     1       /* has changes in the catalog tree */
+#define HFSPLUS_I_EXT_DIRTY     2       /* has changes in the extent tree */
+#define HFSPLUS_I_ALLOC_DIRTY   3       /* has changes in the allocation file */
+#define HFSPLUS_IS_RSRC(inode) \
+        test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
+static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
+{
+        return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+}
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+/*
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
+ * Mark an inode dirty, and also mark the btree in which the
+ * specific type of metadata is stored.
+ * For data or metadata that gets written back into the catalog btree
+ * by hfsplus_write_inode, a plain mark_inode_dirty call is enough.
+ */
+static inline void hfsplus_mark_inode_dirty(struct inode *inode,
+                unsigned int flag)
+{
+        set_bit(flag, &HFSPLUS_I(inode)->flags);
+        mark_inode_dirty(inode);
+}
 struct hfs_find_data {
        /* filled by caller */
@@ -318,9 +355,12 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
 int hfs_brec_goto(struct hfs_find_data *, int);
 /* catalog.c */
-int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
-int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
+                const hfsplus_btree_key *);
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
+                const hfsplus_btree_key *);
+void hfsplus_cat_build_key(struct super_block *sb,
+                hfsplus_btree_key *, u32, struct qstr *);
 int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
 int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
@@ -336,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations;
 int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
 void hfsplus_ext_write_extent(struct inode *);
 int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
-int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int);
+int hfsplus_free_fork(struct super_block *, u32,
+                struct hfsplus_fork_raw *, int);
 int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
@@ -351,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
 int hfsplus_cat_write_inode(struct inode *);
 struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
+int hfsplus_file_fsync(struct file *file, int datasync);
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
@@ -362,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* options.c */
 int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
+int hfsplus_parse_options_remount(char *input, int *force);
 void hfsplus_fill_defaults(struct hfsplus_sb_info *);
 int hfsplus_show_options(struct seq_file *, struct vfsmount *);
@@ -375,45 +418,26 @@ extern u16 hfsplus_decompose_table[];
 extern u16 hfsplus_compose_table[];
 /* unicode.c */
-int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+int hfsplus_strcasecmp(const struct hfsplus_unistr *,
-int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+                const struct hfsplus_unistr *);
-int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
+int hfsplus_strcmp(const struct hfsplus_unistr *,
-int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
+                const struct hfsplus_unistr *);
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
+int hfsplus_uni2asc(struct super_block *,
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
+                const struct hfsplus_unistr *, char *, int *);
+int hfsplus_asc2uni(struct super_block *,
+                struct hfsplus_unistr *, const char *, int);
+int hfsplus_hash_dentry(const struct dentry *dentry,
+                const struct inode *inode, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 /* wrapper.c */
 int hfsplus_read_wrapper(struct super_block *);
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
-/* access macros */
+                void *data, int rw);
-static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
-{
-        return sb->s_fs_info;
-}
-static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
-{
-        return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
-}
-#define sb_bread512(sb, sec, data) ({                   \
-        struct buffer_head *__bh;                       \
-        sector_t __block;                               \
-        loff_t __start;                                 \
-        int __offset;                                   \
-                                                        \
-        __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
-        __block = __start >> (sb)->s_blocksize_bits;    \
-        __offset = __start & ((sb)->s_blocksize - 1);   \
-        __bh = sb_bread((sb), __block);                 \
-        if (likely(__bh != NULL))                       \
-                data = (void *)(__bh->b_data + __offset);\
-        else                                            \
-                data = NULL;                            \
-        __bh;                                           \
-})
 /* time macros */
 #define __hfsp_mt2ut(t)         (be32_to_cpu(t) - 2082844800U)
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 6892899fd6f..927cdd6d5bf 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
 #define HFSP_WRAPOFF_EMBEDSIG     0x7C
 #define HFSP_WRAPOFF_EMBEDEXT     0x7E
-#define HFSP_HIDDENDIR_NAME     "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
+#define HFSP_HIDDENDIR_NAME \
+        "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
 #define HFSP_HARDLINK_TYPE      0x686c6e6b      /* 'hlnk' */
 #define HFSP_HFSPLUS_CREATOR    0x6866732b      /* 'hfs+' */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8afd7e84f98..a8df651747f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
 * Inode handling routines
 */
+#include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
        if (!tree)
                return 0;
        if (tree->node_size >= PAGE_CACHE_SIZE) {
-                nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+                nidx = page->index >>
+                        (tree->node_size_shift - PAGE_CACHE_SHIFT);
                spin_lock(&tree->hash_lock);
                node = hfs_bnode_findhash(tree, nidx);
                if (!node)
@@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
                }
                spin_unlock(&tree->hash_lock);
        } else {
-                nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+                nidx = page->index <<
+                        (PAGE_CACHE_SHIFT - tree->node_size_shift);
                i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
                spin_lock(&tree->hash_lock);
                do {
@@ -166,8 +169,8 @@ const struct dentry_operations hfsplus_dentry_operations = {
        .d_compare    = hfsplus_compare_dentry,
 };
-static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
+static struct dentry *hfsplus_file_lookup(struct inode *dir,
-                                          struct nameidata *nd)
+                struct dentry *dentry, struct nameidata *nd)
 {
        struct hfs_find_data fd;
        struct super_block *sb = dir->i_sb;
@@ -190,7 +193,9 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
        inode->i_ino = dir->i_ino;
        INIT_LIST_HEAD(&hip->open_dir_list);
        mutex_init(&hip->extents_lock);
-        hip->flags = HFSPLUS_FLG_RSRC;
+        hip->extent_state = 0;
+        hip->flags = 0;
+        set_bit(HFSPLUS_I_RSRC, &hip->flags);
        hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
        err = hfsplus_find_cat(sb, dir->i_ino, &fd);
@@ -219,7 +224,8 @@ out:
        return NULL;
 }
-static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
+static void hfsplus_get_perms(struct inode *inode,
+                struct hfsplus_perm *perms, int dir)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
        u16 mode;
@@ -302,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
        return 0;
 }
-static int hfsplus_file_fsync(struct file *filp, int datasync)
+int hfsplus_file_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = filp->f_mapping->host;
+        struct inode *inode = file->f_mapping->host;
-        struct super_block * sb;
+        struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
-        int ret, err;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+        int error = 0, error2;
-        /* sync the inode to buffers */
-        ret = write_inode_now(inode, 0);
+        /*
+         * Sync inode metadata into the catalog and extent trees.
-        /* sync the superblock to buffers */
+         */
-        sb = inode->i_sb;
+        sync_inode_metadata(inode, 1);
-        if (sb->s_dirt) {
-                if (!(sb->s_flags & MS_RDONLY))
+        /*
-                        hfsplus_sync_fs(sb, 1);
+         * And explicitly write out the btrees.
-                else
+         */
-                        sb->s_dirt = 0;
+        if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
+                error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+        if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
+                error2 =
+                        filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+                if (!error)
+                        error = error2;
        }
-        /* .. finally sync the buffers to disk */
+        if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
-        err = sync_blockdev(sb->s_bdev);
+                error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
-        if (!ret)
+                if (!error)
-                ret = err;
+                        error = error2;
-        return ret;
+        }
+        if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+        return error;
 }
 static const struct inode_operations hfsplus_file_inode_operations = {
@@ -337,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
 };
 static const struct file_operations hfsplus_file_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .write          = do_sync_write,
@@ -370,6 +388,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
        INIT_LIST_HEAD(&hip->open_dir_list);
        mutex_init(&hip->extents_lock);
        atomic_set(&hip->opencnt, 0);
+        hip->extent_state = 0;
        hip->flags = 0;
        memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
        memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
@@ -457,7 +476,8 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
        }
 }
-void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
+void hfsplus_inode_write_fork(struct inode *inode,
+                struct hfsplus_fork_raw *fork)
 {
        memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
               sizeof(hfsplus_extent_rec));
@@ -499,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
                                        sizeof(struct hfsplus_cat_file));
-                hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ?
+                hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
-                                        &file->data_fork : &file->rsrc_fork);
+                                        &file->rsrc_fork : &file->data_fork);
                hfsplus_get_perms(inode, &file->permissions, 0);
                inode->i_nlink = 1;
                if (S_ISREG(inode->i_mode)) {
                        if (file->permissions.dev)
-                                inode->i_nlink = be32_to_cpu(file->permissions.dev);
+                                inode->i_nlink =
+                                        be32_to_cpu(file->permissions.dev);
                        inode->i_op = &hfsplus_file_inode_operations;
                        inode->i_fop = &hfsplus_file_operations;
                        inode->i_mapping->a_ops = &hfsplus_aops;
@@ -578,7 +599,9 @@ int hfsplus_cat_write_inode(struct inode *inode)
                                        sizeof(struct hfsplus_cat_file));
                hfsplus_inode_write_fork(inode, &file->data_fork);
                hfsplus_cat_set_perms(inode, &file->permissions);
-                if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
+                if (HFSPLUS_FLG_IMMUTABLE &
+                                (file->permissions.rootflags |
+                                        file->permissions.userflags))
                        file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
                else
                        file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
@@ -588,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
                hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_file));
        }
+        set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
 out:
        hfs_find_exit(&fd);
        return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 40a85a3ded6..508ce662ce1 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -28,7 +28,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
        if (inode->i_flags & S_IMMUTABLE)
                flags |= FS_IMMUTABLE_FL;
-        if (inode->i_flags |= S_APPEND)
+        if (inode->i_flags & S_APPEND)
                flags |= FS_APPEND_FL;
        if (hip->userflags & HFSPLUS_FLG_NODUMP)
                flags |= FS_NODUMP_FL;
@@ -147,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
                        res = -ERANGE;
        } else
                res = -EOPNOTSUPP;
-        if (!res)
+        if (!res) {
                hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
                                sizeof(struct hfsplus_cat_file));
+                hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+        }
 out:
        hfs_find_exit(&fd);
        return res;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index f9ab276a4d8..bb62a588214 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
        opt_umask, opt_uid, opt_gid,
        opt_part, opt_session, opt_nls,
        opt_nodecompose, opt_decompose,
+        opt_barrier, opt_nobarrier,
        opt_force, opt_err
 };
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
        { opt_nls, "nls=%s" },
        { opt_decompose, "decompose" },
        { opt_nodecompose, "nodecompose" },
+        { opt_barrier, "barrier" },
+        { opt_nobarrier, "nobarrier" },
        { opt_force, "force" },
        { opt_err, NULL }
 };
@@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
        return 0;
 }
+int hfsplus_parse_options_remount(char *input, int *force)
+{
+        char *p;
+        substring_t args[MAX_OPT_ARGS];
+        int token;
+        if (!input)
+                return 0;
+        while ((p = strsep(&input, ",")) != NULL) {
+                if (!*p)
+                        continue;
+                token = match_token(p, tokens, args);
+                switch (token) {
+                case opt_force:
+                        *force = 1;
+                        break;
+                default:
+                        break;
+                }
+        }
+        return 1;
+}
 /* Parse options from mount. Returns 0 on failure */
 /* input is the options passed to mount() as a string */
 int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
@@ -136,7 +165,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                        if (p)
                                sbi->nls = load_nls(p);
                        if (!sbi->nls) {
-                                printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p);
+                                printk(KERN_ERR "hfs: unable to load "
+                                                "nls mapping \"%s\"\n",
+                                        p);
                                kfree(p);
                                return 0;
                        }
@@ -148,6 +179,12 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                case opt_nodecompose:
                        set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                        break;
+                case opt_barrier:
+                        clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+                        break;
+                case opt_nobarrier:
+                        set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+                        break;
                case opt_force:
                        set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
                        break;
@@ -177,7 +214,8 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
        if (sbi->type != HFSPLUS_DEF_CR_TYPE)
                seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
-        seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid);
+        seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
+                sbi->uid, sbi->gid);
        if (sbi->part >= 0)
                seq_printf(seq, ",part=%u", sbi->part);
        if (sbi->session >= 0)
@@ -186,5 +224,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                seq_printf(seq, ",nls=%s", sbi->nls->charset);
        if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
                seq_printf(seq, ",nodecompose");
+        if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                seq_printf(seq, ",nobarrier");
        return 0;
 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 208b16c645c..d66ad113b1c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
 * linux/fs/hfsplus/part_tbl.c
 *
 * Copyright (C) 1996-1997  Paul H. Hargrove
- * This file may be distributed under the terms of the GNU General Public License.
+ * This file may be distributed under the terms of
+ * the GNU General Public License.
 *
 * Original code to handle the new style Mac partition table based on
 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
@@ -13,6 +14,7 @@
 *
 */
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 /* offsets to various blocks */
@@ -58,77 +60,94 @@ struct new_pmap {
 */
 struct old_pmap {
        __be16          pdSig;  /* Signature bytes */
-        struct  old_pmap_entry {
+        struct old_pmap_entry {
                __be32  pdStart;
                __be32  pdSize;
                __be32  pdFSID;
        }       pdEntry[42];
 } __packed;
+static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
+                sector_t *part_start, sector_t *part_size)
+{
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        int i;
+        for (i = 0; i < 42; i++) {
+                struct old_pmap_entry *p = &pm->pdEntry[i];
+                if (p->pdStart && p->pdSize &&
+                    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
+                    (sbi->part < 0 || sbi->part == i)) {
+                        *part_start += be32_to_cpu(p->pdStart);
+                        *part_size = be32_to_cpu(p->pdSize);
+                        return 0;
+                }
+        }
+        return -ENOENT;
+}
+static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
+                sector_t *part_start, sector_t *part_size)
+{
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        int size = be32_to_cpu(pm->pmMapBlkCnt);
+        int res;
+        int i = 0;
+        do {
+                if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
+                    (sbi->part < 0 || sbi->part == i)) {
+                        *part_start += be32_to_cpu(pm->pmPyPartStart);
+                        *part_size = be32_to_cpu(pm->pmPartBlkCnt);
+                        return 0;
+                }
+                if (++i >= size)
+                        return -ENOENT;
+                res = hfsplus_submit_bio(sb->s_bdev,
+                                         *part_start + HFS_PMAP_BLK + i,
+                                         pm, READ);
+                if (res)
+                        return res;
+        } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
+        return -ENOENT;
+}
 /*
- * hfs_part_find()
+ * Parse the partition map looking for the start and length of a
- *
+ * HFS/HFS+ partition.
- * Parse the partition map looking for the
- * start and length of the 'part'th HFS partition.
 */
 int hfs_part_find(struct super_block *sb,
-                  sector_t *part_start, sector_t *part_size)
+                sector_t *part_start, sector_t *part_size)
 {
-        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        void *data;
-        struct buffer_head *bh;
+        int res;
-        __be16 *data;
-        int i, size, res;
+        data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!data)
+                return -ENOMEM;
-        res = -ENOENT;
+        res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
-        bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data);
+                                 data, READ);
-        if (!bh)
+        if (res)
-                return -EIO;
+                goto out;       /* read failed: still free the sector buffer */
-        switch (be16_to_cpu(*data)) {
+        switch (be16_to_cpu(*((__be16 *)data))) {
        case HFS_OLD_PMAP_MAGIC:
-          {
+                res = hfs_parse_old_pmap(sb, data, part_start, part_size);
-                struct old_pmap *pm;
-                struct old_pmap_entry *p;
-                pm = (struct old_pmap *)bh->b_data;
-                p = pm->pdEntry;
-                size = 42;
-                for (i = 0; i < size; p++, i++) {
-                        if (p->pdStart && p->pdSize &&
-                            p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-                            (sbi->part < 0 || sbi->part == i)) {
-                                *part_start += be32_to_cpu(p->pdStart);
-                                *part_size = be32_to_cpu(p->pdSize);
-                                res = 0;
-                        }
-                }
                break;
-          }
        case HFS_NEW_PMAP_MAGIC:
-          {
+                res = hfs_parse_new_pmap(sb, data, part_start, part_size);
-                struct new_pmap *pm;
+                break;
+        default:
-                pm = (struct new_pmap *)bh->b_data;
+                res = -ENOENT;
-                size = be32_to_cpu(pm->pmMapBlkCnt);
-                for (i = 0; i < size;) {
-                        if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-                            (sbi->part < 0 || sbi->part == i)) {
-                                *part_start += be32_to_cpu(pm->pmPyPartStart);
-                                *part_size = be32_to_cpu(pm->pmPartBlkCnt);
-                                res = 0;
-                                break;
-                        }
-                        brelse(bh);
-                        bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
-                        if (!bh)
-                                return -EIO;
-                        if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
-                                break;
-                }
                break;
-          }
        }
-        brelse(bh);
+out:
+        kfree(data);
        return res;
 }
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 52cc746d3ba..9a3b4795f43 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
@@ -66,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
        INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
        mutex_init(&HFSPLUS_I(inode)->extents_lock);
        HFSPLUS_I(inode)->flags = 0;
+        HFSPLUS_I(inode)->extent_state = 0;
        HFSPLUS_I(inode)->rsrc_inode = NULL;
        atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
@@ -157,45 +159,65 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
        struct hfsplus_vh *vhdr = sbi->s_vhdr;
+        int write_backup = 0;
+        int error, error2;
+        if (!wait)
+                return 0;
        dprint(DBG_SUPER, "hfsplus_write_super\n");
-        mutex_lock(&sbi->vh_mutex);
-        mutex_lock(&sbi->alloc_mutex);
        sb->s_dirt = 0;
+        /*
+         * Explicitly write out the special metadata inodes.
+         *
+         * While these special inodes are marked as hashed and written
+         * out periodically by the flusher threads we redirty them
+         * during writeout of normal inodes, and thus the livelock
+         * prevents us from getting the latest state to disk.
+         */
+        error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+        error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+        if (!error)
+                error = error2;
+        error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+        if (!error)
+                error = error2;
+        mutex_lock(&sbi->vh_mutex);
+        mutex_lock(&sbi->alloc_mutex);
        vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
        vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
        vhdr->folder_count = cpu_to_be32(sbi->folder_count);
        vhdr->file_count = cpu_to_be32(sbi->file_count);
-        mark_buffer_dirty(sbi->s_vhbh);
        if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
-                if (sbi->sect_count) {
+                memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
-                        struct buffer_head *bh;
+                write_backup = 1;
-                        u32 block, offset;
-                        block = sbi->blockoffset;
-                        block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
-                        offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
-                        printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
-                                          sbi->blockoffset, sbi->sect_count,
-                                          block, offset);
-                        bh = sb_bread(sb, block);
-                        if (bh) {
-                                vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
-                                if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-                                        memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
-                                        mark_buffer_dirty(bh);
-                                        brelse(bh);
-                                } else
-                                        printk(KERN_WARNING "hfs: backup not found!\n");
-                        }
-                }
        }
+        error2 = hfsplus_submit_bio(sb->s_bdev,
+                                   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
+                                   sbi->s_vhdr, WRITE_SYNC);
+        if (!error)
+                error = error2;
+        if (!write_backup)
+                goto out;
+        error2 = hfsplus_submit_bio(sb->s_bdev,
+                                  sbi->part_start + sbi->sect_count - 2,
+                                  sbi->s_backup_vhdr, WRITE_SYNC);
+        if (!error)
+                error = error2; /* report a failed backup header write */
+out:
        mutex_unlock(&sbi->alloc_mutex);
        mutex_unlock(&sbi->vh_mutex);
-        return 0;
+        if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+                blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+        return error;
 }
 static void hfsplus_write_super(struct super_block *sb)
@@ -215,23 +237,22 @@ static void hfsplus_put_super(struct super_block *sb)
        if (!sb->s_fs_info)
                return;
-        if (sb->s_dirt)
-                hfsplus_write_super(sb);
        if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
                struct hfsplus_vh *vhdr = sbi->s_vhdr;
                vhdr->modify_date = hfsp_now2mt();
                vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-                mark_buffer_dirty(sbi->s_vhbh);
-                sync_dirty_buffer(sbi->s_vhbh);
+                hfsplus_sync_fs(sb, 1);
        }
        hfs_btree_close(sbi->cat_tree);
        hfs_btree_close(sbi->ext_tree);
        iput(sbi->alloc_file);
        iput(sbi->hidden_dir);
-        brelse(sbi->s_vhbh);
+        kfree(sbi->s_vhdr);
+        kfree(sbi->s_backup_vhdr);
        unload_nls(sbi->nls);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
@@ -263,26 +284,31 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
                return 0;
        if (!(*flags & MS_RDONLY)) {
                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
-                struct hfsplus_sb_info sbi;
+                int force = 0;
-                memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
+                if (!hfsplus_parse_options_remount(data, &force))
-                sbi.nls = HFSPLUS_SB(sb)->nls;
-                if (!hfsplus_parse_options(data, &sbi))
                        return -EINVAL;
                if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-                        printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
+                        printk(KERN_WARNING "hfs: filesystem was "
-                               "running fsck.hfsplus is recommended.  leaving read-only.\n");
+                                        "not cleanly unmounted, "
+                                        "running fsck.hfsplus is recommended.  "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
-                } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
+                } else if (force) {
                        /* nothing */
-                } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+                } else if (vhdr->attributes &
-                        printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
+                                cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+                        printk(KERN_WARNING "hfs: filesystem is marked locked, "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
-                } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+                } else if (vhdr->attributes &
-                        printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n");
+                                cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+                        printk(KERN_WARNING "hfs: filesystem is "
+                                        "marked journaled, "
+                                        "leaving read-only.\n");
                        sb->s_flags |= MS_RDONLY;
                        *flags |= MS_RDONLY;
                }
@@ -372,17 +398,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-                printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
+                printk(KERN_WARNING "hfs: Filesystem was "
-                       "running fsck.hfsplus is recommended.  mounting read-only.\n");
+                                "not cleanly unmounted, "
+                                "running fsck.hfsplus is recommended.  "
+                                "mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
        } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
                /* nothing */
        } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
-        } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
+        } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
-                printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
+                        !(sb->s_flags & MS_RDONLY)) {
-                       "use the force option at your own risk, mounting read-only.\n");
+                printk(KERN_WARNING "hfs: write access to "
+                                "a journaled filesystem is not supported, "
+                                "use the force option at your own risk, "
+                                "mounting read-only.\n");
                sb->s_flags |= MS_RDONLY;
        }
@@ -413,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                err = PTR_ERR(root);
                goto cleanup;
        }
+        sb->s_d_op = &hfsplus_dentry_operations;
        sb->s_root = d_alloc_root(root);
        if (!sb->s_root) {
                iput(root);
                err = -ENOMEM;
                goto cleanup;
        }
-        sb->s_root->d_op = &hfsplus_dentry_operations;
        str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
        str.name = HFSP_HIDDENDIR_NAME;
@@ -449,19 +480,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        be32_add_cpu(&vhdr->write_count, 1);
        vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
        vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-        mark_buffer_dirty(sbi->s_vhbh);
+        hfsplus_sync_fs(sb, 1);
-        sync_dirty_buffer(sbi->s_vhbh);
        if (!sbi->hidden_dir) {
-                printk(KERN_DEBUG "hfs: create hidden dir...\n");
                mutex_lock(&sbi->vh_mutex);
                sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
                hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
                                   &str, sbi->hidden_dir);
                mutex_unlock(&sbi->vh_mutex);
-                mark_inode_dirty(sbi->hidden_dir);
+                hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
        }
 out:
        unload_nls(sbi->nls);
@@ -488,11 +516,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
        return i ? &i->vfs_inode : NULL;
 }
-static void hfsplus_destroy_inode(struct inode *inode)
+static void hfsplus_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
+static void hfsplus_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hfsplus_i_callback);
+}
 #define HFSPLUS_INODE_SIZE      sizeof(struct hfsplus_inode_info)
 static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b66d67de882..a3f0bfcc881 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
 /* Returns folded char, or 0 if ignorable */
 static inline u16 case_fold(u16 c)
 {
-        u16 tmp;
+        u16 tmp;
-        tmp = hfsplus_case_fold_table[c >> 8];
+        tmp = hfsplus_case_fold_table[c >> 8];
-        if (tmp)
+        if (tmp)
-                tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
+                tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
-        else
+        else
-                tmp = c;
+                tmp = c;
-        return tmp;
+        return tmp;
 }
 /* Compare unicode strings, return values like normal strcmp */
@@ -118,7 +118,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
        return NULL;
 }
-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
+int hfsplus_uni2asc(struct super_block *sb,
+                const struct hfsplus_unistr *ustr,
+                char *astr, int *len_p)
 {
        const hfsplus_unichr *ip;
        struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                                goto same;
                        c1 = be16_to_cpu(*ip);
                        if (likely(compose))
-                                ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1);
+                                ce1 = hfsplus_compose_lookup(
+                                        hfsplus_compose_table, c1);
                        if (ce1)
                                break;
                        switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                if (ce2) {
                        i = 1;
                        while (i < ustrlen) {
-                                ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i]));
+                                ce1 = hfsplus_compose_lookup(ce2,
+                                        be16_to_cpu(ip[i]));
                                if (!ce1)
                                        break;
                                i++;
@@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                                goto done;
                        }
                }
-        same:
+same:
                switch (c0) {
                case 0:
                        cc = 0x2400;
@@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
                default:
                        cc = c0;
                }
-        done:
+done:
                res = nls->uni2char(cc, op, len);
                if (res < 0) {
                        if (res == -ENAMETOOLONG)
@@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 * Composed unicode characters are decomposed and case-folding is performed
 * if the appropriate bits are (un)set on the superblock.
 */
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *str)
 {
        struct super_block *sb = dentry->d_sb;
        const char *astr;
@@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 * Composed unicode characters are decomposed and case-folding is performed
 * if the appropriate bits are (un)set on the superblock.
 */
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfsplus_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct super_block *sb = dentry->d_sb;
+        struct super_block *sb = parent->d_sb;
        int casefold, decompose, size;
        int dsize1, dsize2, len1, len2;
        const u16 *dstr1, *dstr2;
@@ -375,10 +383,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
        casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
        decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
-        astr1 = s1->name;
+        astr1 = str;
-        len1 = s1->len;
+        len1 = len;
-        astr2 = s2->name;
+        astr2 = name->name;
-        len2 = s2->len;
+        len2 = name->len;
        dsize1 = dsize2 = 0;
        dstr1 = dstr2 = NULL;
@@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
                        astr1 += size;
                        len1 -= size;
-                        if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
+                        if (decompose)
+                                dstr1 = decompose_unichar(c, &dsize1);
+                        if (!decompose || !dstr1) {
                                c1 = c;
                                dstr1 = &c1;
                                dsize1 = 1;
@@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
                        astr2 += size;
                        len2 -= size;
-                        if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
+                        if (decompose)
+                                dstr2 = decompose_unichar(c, &dsize2);
+                        if (!decompose || !dstr2) {
                                c2 = c;
                                dstr2 = &c2;
                                dsize2 = 1;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 8972c20b321..196231794f6 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
        u16 embed_count;
 };
+static void hfsplus_end_io_sync(struct bio *bio, int err)
+{
+        if (err)
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        complete(bio->bi_private);
+}
+/*
+ * Synchronously transfer one HFSPLUS_SECTOR_SIZE sector between @data and
+ * @sector on @bdev.  @rw is handed unchanged to submit_bio().
+ *
+ * Returns 0 on success, or -EIO if the bio completed without BIO_UPTODATE.
+ */
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
+                void *data, int rw)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        struct bio *bio;
+        int ret = 0;
+        bio = bio_alloc(GFP_NOIO, 1);
+        bio->bi_sector = sector;
+        bio->bi_bdev = bdev;
+        bio->bi_end_io = hfsplus_end_io_sync;
+        bio->bi_private = &wait;
+        /*
+         * We always submit one sector at a time, so bio_add_page must not fail.
+         */
+        if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
+                         offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
+                BUG();
+        submit_bio(rw, bio);
+        wait_for_completion(&wait);
+        if (!bio_flagged(bio, BIO_UPTODATE))
+                ret = -EIO;
+        /* Drop the reference from bio_alloc(); the end_io handler does not. */
+        bio_put(bio);
+        return ret;
+}
 static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
 {
        u32 extent;
@@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
           !(attrib & HFSP_WRAP_ATTRIB_SPARED))
                return 0;
-        wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
+        wd->ablk_size =
+                be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
        if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
                return 0;
        if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
                return 0;
-        wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
+        wd->ablk_start =
+                be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
        extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
        wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -68,7 +104,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
        if (HFSPLUS_SB(sb)->session >= 0) {
                te.cdte_track = HFSPLUS_SB(sb)->session;
                te.cdte_format = CDROM_LBA;
-                res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
+                res = ioctl_by_bdev(sb->s_bdev,
+                        CDROMREADTOCENTRY, (unsigned long)&te);
                if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
                        *start = (sector_t)te.cdte_addr.lba << 2;
                        return 0;
@@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
                return -EINVAL;
        }
        ms_info.addr_format = CDROM_LBA;
-        res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
+        res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
+                (unsigned long)&ms_info);
        if (!res && ms_info.xa_flag)
                *start = (sector_t)ms_info.addr.lba << 2;
        return 0;
@@ -88,100 +126,112 @@ static int hfsplus_get_last_session(struct super_block *sb,
 int hfsplus_read_wrapper(struct super_block *sb)
 {
        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
-        struct buffer_head *bh;
-        struct hfsplus_vh *vhdr;
        struct hfsplus_wd wd;
        sector_t part_start, part_size;
        u32 blocksize;
+        int error = 0;
+        error = -EINVAL;
        blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
        if (!blocksize)
-                return -EINVAL;
+                goto out;
        if (hfsplus_get_last_session(sb, &part_start, &part_size))
-                return -EINVAL;
+                goto out;
        if ((u64)part_start + part_size > 0x100000000ULL) {
                pr_err("hfs: volumes larger than 2TB are not supported yet\n");
-                return -EINVAL;
+                goto out;
        }
-        while (1) {
-                bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-                if (!bh)
-                        return -EIO;
-                if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
-                        if (!hfsplus_read_mdb(vhdr, &wd))
-                                goto error;
-                        wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
-                        part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
-                        part_size = wd.embed_count * wd.ablk_size;
-                        brelse(bh);
-                        bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-                        if (!bh)
-                                return -EIO;
-                }
-                if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-                        break;
-                if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-                        set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
-                        break;
-                }
-                brelse(bh);
-                /* check for a partition block
+        error = -ENOMEM;
+        sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!sbi->s_vhdr)
+                goto out;
+        sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+        if (!sbi->s_backup_vhdr)
+                goto out_free_vhdr;
+reread:
+        error = hfsplus_submit_bio(sb->s_bdev,
+                                   part_start + HFSPLUS_VOLHEAD_SECTOR,
+                                   sbi->s_vhdr, READ);
+        if (error)
+                goto out_free_backup_vhdr;
+        error = -EINVAL;
+        switch (sbi->s_vhdr->signature) {
+        case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
+                set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
+                /*FALLTHRU*/
+        case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
+                break;
+        case cpu_to_be16(HFSP_WRAP_MAGIC):
+                if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
+                        goto out_free_backup_vhdr; /* buffers are live */
+                wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
+                part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
+                part_size = wd.embed_count * wd.ablk_size;
+                goto reread;
+        default:
+                /*
+                 * Check for a partition block.
+                 *
                 * (should do this only for cdrom/loop though)
                 */
                if (hfs_part_find(sb, &part_start, &part_size))
-                        return -EINVAL;
+                        goto out_free_backup_vhdr; /* buffers are live */
+                goto reread;
+        }
+        error = hfsplus_submit_bio(sb->s_bdev,
+                                   part_start + part_size - 2,
+                                   sbi->s_backup_vhdr, READ);
+        if (error)
+                goto out_free_backup_vhdr;
+        error = -EINVAL;
+        if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
+                printk(KERN_WARNING
+                        "hfs: invalid secondary volume header\n");
+                goto out_free_backup_vhdr;
        }
-        blocksize = be32_to_cpu(vhdr->blocksize);
+        blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
-        brelse(bh);
-        /* block size must be at least as large as a sector
+        /*
-         * and a multiple of 2
+         * Block size must be at least as large as a sector and a multiple of 2.
         */
-        if (blocksize < HFSPLUS_SECTOR_SIZE ||
+        if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
-            ((blocksize - 1) & blocksize))
+                goto out_free_backup_vhdr;
-                return -EINVAL;
        sbi->alloc_blksz = blocksize;
        sbi->alloc_blksz_shift = 0;
        while ((blocksize >>= 1) != 0)
                sbi->alloc_blksz_shift++;
        blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
-        /* align block size to block offset */
+        /*
+         * Align block size to block offset.
+         */
        while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
                blocksize >>= 1;
        if (sb_set_blocksize(sb, blocksize) != blocksize) {
-                printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
+                printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
-                return -EINVAL;
+                        blocksize);
+                goto out_free_backup_vhdr;
        }
        sbi->blockoffset =
                part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+        sbi->part_start = part_start;
        sbi->sect_count = part_size;
        sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
-        bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-        if (!bh)
-                return -EIO;
-        /* should still be the same... */
-        if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
-                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
-                        goto error;
-        } else {
-                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-                        goto error;
-        }
-        sbi->s_vhbh = bh;
-        sbi->s_vhdr = vhdr;
        return 0;
- error:
-        brelse(bh);
+out_free_backup_vhdr:
-        return -EINVAL;
+        kfree(sbi->s_backup_vhdr);
+out_free_vhdr:
+        kfree(sbi->s_vhdr);
+out:
+        return error;
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2c0f148a49e..2638c834ed2 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
-static int hostfs_d_delete(struct dentry *dentry)
+static int hostfs_d_delete(const struct dentry *dentry)
 {
        return 1;
 }
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
 static char *__dentry_name(struct dentry *dentry, char *name)
 {
-        char *p = __dentry_path(dentry, name, PATH_MAX);
+        char *p = dentry_path_raw(dentry, name, PATH_MAX);
        char *root;
        size_t len;
-        spin_unlock(&dcache_lock);
        root = dentry->d_sb->s_fs_info;
        len = strlen(root);
        if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
        if (!name)
                return NULL;
-        spin_lock(&dcache_lock);
        return __dentry_name(dentry, name); /* will unlock */
 }
 static char *inode_name(struct inode *ino)
 {
        struct dentry *dentry;
-        char *name = __getname();
+        char *name;
-        if (!name)
-                return NULL;
-        spin_lock(&dcache_lock);
+        dentry = d_find_alias(ino);
-        if (list_empty(&ino->i_dentry)) {
+        if (!dentry)
-                spin_unlock(&dcache_lock);
-                __putname(name);
                return NULL;
-        }
-        dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
+        name = dentry_name(dentry);
-        return __dentry_name(dentry, name); /* will unlock */
+        dput(dentry);
+        return name;
 }
 static char *follow_link(char *link)
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
        }
 }
-static void hostfs_destroy_inode(struct inode *inode)
+static void hostfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kfree(HOSTFS_I(inode));
 }
+static void hostfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hostfs_i_callback);
+}
 static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
        const char *root_path = vfs->mnt_sb->s_fs_info;
@@ -609,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
                goto out_put;
        d_add(dentry, inode);
-        dentry->d_op = &hostfs_dentry_ops;
        return NULL;
 out_put:
@@ -746,11 +748,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
        return err;
 }
-int hostfs_permission(struct inode *ino, int desired)
+int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
 {
        char *name;
        int r = 0, w = 0, x = 0, err;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        if (desired & MAY_READ) r = 1;
        if (desired & MAY_WRITE) w = 1;
        if (desired & MAY_EXEC) x = 1;
@@ -765,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired)
                err = access_file(name, r, w, x);
        __putname(name);
        if (!err)
-                err = generic_permission(ino, desired, NULL);
+                err = generic_permission(ino, desired, flags, NULL);
        return err;
 }
@@ -916,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        sb->s_blocksize_bits = 10;
        sb->s_magic = HOSTFS_SUPER_MAGIC;
        sb->s_op = &hostfs_sbops;
+        sb->s_d_op = &hostfs_dentry_ops;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        /* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5..05d4816e4e7 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
 * Note: the dentry argument is the parent dentry.
 */
-static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        unsigned long    hash;
        int              i;
@@ -34,29 +35,30 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
        return 0;
 }
-static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int hpfs_compare_dentry(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        unsigned al=a->len;
+        unsigned al = len;
-        unsigned bl=b->len;
+        unsigned bl = name->len;
-        hpfs_adjust_length(a->name, &al);
+        hpfs_adjust_length(str, &al);
        /*hpfs_adjust_length(b->name, &bl);*/
-        /* 'a' is the qstr of an already existing dentry, so the name
-         * must be valid. 'b' must be validated first.
+        /*
+         * 'str' is the name of an already existing dentry, so the name
+         * must be valid. 'name' must be validated first.
         */
-        if (hpfs_chk_name(b->name, &bl))
+        if (hpfs_chk_name(name->name, &bl))
                return 1;
-        if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
+        if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
                return 1;
        return 0;
 }
-static const struct dentry_operations hpfs_dentry_operations = {
+const struct dentry_operations hpfs_dentry_operations = {
        .d_hash         = hpfs_hash_dentry,
        .d_compare      = hpfs_compare_dentry,
 };
-void hpfs_set_dentry_operations(struct dentry *dentry)
-{
-        dentry->d_op = &hpfs_dentry_operations;
-}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceb..d32f63a569f 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
        end:
        end_add:
-        hpfs_set_dentry_operations(dentry);
        unlock_kernel();
        d_add(dentry, result);
        return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2fee17d0d9a..1c43dbea55e 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
 /* dentry.c */
-void hpfs_set_dentry_operations(struct dentry *);
+extern const struct dentry_operations hpfs_dentry_operations;
 /* dir.c */
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd1..1ae35baa539 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
            attr->ia_size != i_size_read(inode)) {
                error = vmtruncate(inode, attr->ia_size);
                if (error)
-                        return error;
+                        goto out_unlock;
        }
        setattr_copy(inode, attr);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f6..f4ad9e31ddc 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
                        unlock_kernel();
                        return -ENOSPC;
                }
-                if (generic_permission(inode, MAY_WRITE, NULL) ||
+                if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
                    !S_ISREG(inode->i_mode) ||
                    get_write_access(inode)) {
                        d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6c5f01597c3..b30426b1fc9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void hpfs_destroy_inode(struct inode *inode)
+static void hpfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
+static void hpfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hpfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -543,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
        /* Fill superblock stuff */
        s->s_magic = HPFS_SUPER_MAGIC;
        s->s_op = &hpfs_sops;
+        s->s_d_op = &hpfs_dentry_operations;
        sbi->sb_root = superblock->root;
        sbi->sb_fs_size = superblock->n_sectors;
@@ -644,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                iput(root);
                goto bail0;
        }
-        hpfs_set_dentry_operations(s->s_root);
        /*
         * find the root directory's . pointer & finish filling in the inode
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f702b5f713f..87ed48e0343 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
        mntput(ino->i_sb->s_fs_info);
 }
-static void hppfs_destroy_inode(struct inode *inode)
+static void hppfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kfree(HPPFS_I(inode));
 }
+static void hppfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, hppfs_i_callback);
+}
 static const struct super_operations hppfs_sbops = {
        .alloc_inode    = hppfs_alloc_inode,
        .destroy_inode  = hppfs_destroy_inode,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a4..9885082b470 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
        return &p->vfs_inode;
 }
+static void hugetlbfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+}
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
        hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
        mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
-        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+        call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
 }
 static const struct address_space_operations hugetlbfs_aops = {
@@ -932,8 +939,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
-                        WARN_ONCE(1,
+                        printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
-                          "Using mlock ulimits for SHM_HUGETLB deprecated\n");
                } else {
                        *user = NULL;
                        return ERR_PTR(-EPERM);
diff --git a/fs/inode.c b/fs/inode.c
index ae2727ab0c3..da85e56378f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,26 +102,29 @@ static DECLARE_RWSEM(iprune_sem);
 */
 struct inodes_stat_t inodes_stat;
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 static struct kmem_cache *inode_cachep __read_mostly;
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-        return percpu_counter_sum_positive(&nr_inodes);
+        int i;
+        int sum = 0;
+        for_each_possible_cpu(i)
+                sum += per_cpu(nr_inodes, i);
+        return sum < 0 ? 0 : sum;
 }
 static inline int get_nr_inodes_unused(void)
 {
-        return percpu_counter_sum_positive(&nr_inodes_unused);
+        return inodes_stat.nr_unused;
 }
 int get_nr_dirty_inodes(void)
 {
+        /* not actually dirty inodes, but a wild approximation */
        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
        return nr_dirty > 0 ? nr_dirty : 0;
 }
 /*
@@ -132,7 +135,6 @@ int proc_nr_inodes(ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        inodes_stat.nr_inodes = get_nr_inodes();
-        inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -224,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
-        percpu_counter_inc(&nr_inodes);
+        this_cpu_inc(nr_inodes);
        return 0;
 out:
@@ -255,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
        return inode;
 }
+void free_inode_nonrcu(struct inode *inode)
+{
+        kmem_cache_free(inode_cachep, inode);
+}
+EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
@@ -266,10 +274,17 @@ void __destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
-        percpu_counter_dec(&nr_inodes);
+        this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
+static void i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(inode_cachep, inode);
+}
 static void destroy_inode(struct inode *inode)
 {
        BUG_ON(!list_empty(&inode->i_lru));
@@ -277,7 +292,7 @@ static void destroy_inode(struct inode *inode)
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
-                kmem_cache_free(inode_cachep, (inode));
+                call_rcu(&inode->i_rcu, i_callback);
 }
 /*
@@ -335,7 +350,7 @@ static void inode_lru_list_add(struct inode *inode)
 {
        if (list_empty(&inode->i_lru)) {
                list_add(&inode->i_lru, &inode_lru);
-                percpu_counter_inc(&nr_inodes_unused);
+                inodes_stat.nr_unused++;
        }
 }
@@ -343,7 +358,7 @@ static void inode_lru_list_del(struct inode *inode)
 {
        if (!list_empty(&inode->i_lru)) {
                list_del_init(&inode->i_lru);
-                percpu_counter_dec(&nr_inodes_unused);
+                inodes_stat.nr_unused--;
        }
 }
@@ -430,6 +445,7 @@ void end_writeback(struct inode *inode)
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        inode_sync_wait(inode);
+        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
 }
 EXPORT_SYMBOL(end_writeback);
@@ -513,7 +529,7 @@ void evict_inodes(struct super_block *sb)
                list_move(&inode->i_lru, &dispose);
                list_del_init(&inode->i_wb_list);
                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
        }
        spin_unlock(&inode_lock);
@@ -554,7 +570,7 @@ int invalidate_inodes(struct super_block *sb)
                list_move(&inode->i_lru, &dispose);
                list_del_init(&inode->i_wb_list);
                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
        }
        spin_unlock(&inode_lock);
@@ -616,7 +632,7 @@ static void prune_icache(int nr_to_scan)
                if (atomic_read(&inode->i_count) ||
                    (inode->i_state & ~I_REFERENCED)) {
                        list_del_init(&inode->i_lru);
-                        percpu_counter_dec(&nr_inodes_unused);
+                        inodes_stat.nr_unused--;
                        continue;
                }
@@ -650,7 +666,7 @@ static void prune_icache(int nr_to_scan)
                 */
                list_move(&inode->i_lru, &freeable);
                list_del_init(&inode->i_wb_list);
-                percpu_counter_dec(&nr_inodes_unused);
+                inodes_stat.nr_unused--;
        }
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1664,6 @@ void __init inode_init(void)
                                         SLAB_MEM_SPREAD),
                                         init_once);
        register_shrinker(&icache_shrinker);
-        percpu_counter_init(&nr_inodes, 0);
-        percpu_counter_init(&nr_inodes_unused, 0);
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4..0663568b124 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,12 +63,17 @@ extern int copy_mount_string(const void __user *, char **);
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
+extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
                                struct vfsmount *);
 extern void release_mounts(struct list_head *);
 extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int finish_automount(struct vfsmount *, struct path *);
+extern void mnt_make_longterm(struct vfsmount *);
+extern void mnt_make_shortterm(struct vfsmount *);
 extern void __init mnt_init(void);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3..a59635e295f 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
 #include <linux/syscalls.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -87,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
                            u64 phys, u64 len, u32 flags)
 {
        struct fiemap_extent extent;
-        struct fiemap_extent *dest = fieinfo->fi_extents_start;
+        struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
        /* only count the extents */
        if (fieinfo->fi_extents_max == 0) {
@@ -174,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
 static int ioctl_fiemap(struct file *filp, unsigned long arg)
 {
        struct fiemap fiemap;
+        struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
        struct fiemap_extent_info fieinfo = { 0, };
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct super_block *sb = inode->i_sb;
@@ -183,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
        if (!inode->i_op->fiemap)
                return -EOPNOTSUPP;
-        if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
+        if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
-                           sizeof(struct fiemap)))
                return -EFAULT;
        if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -197,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
        fieinfo.fi_flags = fiemap.fm_flags;
        fieinfo.fi_extents_max = fiemap.fm_extent_count;
-        fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+        fieinfo.fi_extents_start = ufiemap->fm_extents;
        if (fiemap.fm_extent_count != 0 &&
            !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -210,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
        error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
        fiemap.fm_flags = fieinfo.fi_flags;
        fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
-        if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+        if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
                error = -EFAULT;
        return error;
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
-static int ioctl_fstrim(struct file *filp, void __user *argp)
-{
-        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
-        struct fstrim_range range;
-        int ret = 0;
-        if (!capable(CAP_SYS_ADMIN))
-                return -EPERM;
-        /* If filesystem doesn't support trim feature, return. */
-        if (sb->s_op->trim_fs == NULL)
-                return -EOPNOTSUPP;
-        /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
-        if (sb->s_bdev == NULL)
-                return -EINVAL;
-        if (argp == NULL) {
-                range.start = 0;
-                range.len = ULLONG_MAX;
-                range.minlen = 0;
-        } else if (copy_from_user(&range, argp, sizeof(range)))
-                return -EFAULT;
-        ret = sb->s_op->trim_fs(sb, &range);
-        if (ret < 0)
-                return ret;
-        if ((argp != NULL) &&
-            (copy_to_user(argp, &range, sizeof(range))))
-                return -EFAULT;
-        return 0;
-}
 /*
 * When you add any new common ioctls to the switches above and below
 * please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
-        case FITRIM:
-                error = ioctl_fstrim(filp, argp);
-                break;
        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, arg);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc..7da2a06508e 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
        }
        ret = -ESRCH;
-        /*
+        rcu_read_lock();
-         * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
-         * so we can't use rcu_read_lock(). See re-copy of ->ioprio
-         * in copy_process().
-         */
-        read_lock(&tasklist_lock);
        switch (which) {
                case IOPRIO_WHO_PROCESS:
                        if (!who)
@@ -153,7 +148,7 @@ free_uid:
                        ret = -EINVAL;
        }
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
        int ret = -ESRCH;
        int tmpio;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        switch (which) {
                case IOPRIO_WHO_PROCESS:
                        if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
                        ret = -EINVAL;
        }
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bfdeb82a53b..a0f3833c0db 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -26,16 +26,32 @@
 #define BEQUIET
-static int isofs_hashi(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
-static int isofs_hash(struct dentry *parent, struct qstr *qstr);
+                struct qstr *qstr);
-static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_hash(const struct dentry *parent, const struct inode *inode,
-static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b);
+                struct qstr *qstr);
+static int isofs_dentry_cmpi(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 #ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
-static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr);
+                struct qstr *qstr);
-static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
-static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
+                struct qstr *qstr);
+static int isofs_dentry_cmpi_ms(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp_ms(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name);
 #endif
 static void isofs_put_super(struct super_block *sb)
@@ -65,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void isofs_destroy_inode(struct inode *inode)
+static void isofs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
+static void isofs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, isofs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct iso_inode_info *ei = foo;
@@ -160,7 +183,7 @@ struct iso9660_options{
 * Compute the hash for the isofs name corresponding to the dentry.
 */
 static int
-isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
        const char *name;
        int len;
@@ -181,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
 * Compute the hash for the isofs name corresponding to the dentry.
 */
 static int
-isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
        const char *name;
        int len;
@@ -206,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
 }
 /*
- * Case insensitive compare of two isofs names.
+ * Compare two isofs names.
 */
-static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
+static int isofs_dentry_cmp_common(
-                                struct qstr *b, int ms)
+                unsigned int len, const char *str,
+                const struct qstr *name, int ms, int ci)
 {
        int alen, blen;
        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = a->len;
+        alen = name->len;
-        blen = b->len;
+        blen = len;
        if (ms) {
-                while (alen && a->name[alen-1] == '.')
+                while (alen && name->name[alen-1] == '.')
                        alen--;
-                while (blen && b->name[blen-1] == '.')
+                while (blen && str[blen-1] == '.')
                        blen--;
        }
        if (alen == blen) {
-                if (strnicmp(a->name, b->name, alen) == 0)
+                if (ci) {
-                        return 0;
+                        if (strnicmp(name->name, str, alen) == 0)
-        }
+                                return 0;
-        return 1;
+                } else {
-}
+                        if (strncmp(name->name, str, alen) == 0)
+                                return 0;
-/*
+                }
- * Case sensitive compare of two isofs names.
- */
-static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a,
-                                        struct qstr *b, int ms)
-{
-        int alen, blen;
-        /* A filename cannot end in '.' or we treat it like it has none */
-        alen = a->len;
-        blen = b->len;
-        if (ms) {
-                while (alen && a->name[alen-1] == '.')
-                        alen--;
-                while (blen && b->name[blen-1] == '.')
-                        blen--;
-        }
-        if (alen == blen) {
-                if (strncmp(a->name, b->name, alen) == 0)
-                        return 0;
        }
        return 1;
 }
 static int
-isofs_hash(struct dentry *dentry, struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hash_common(dentry, qstr, 0);
 }
 static int
-isofs_hashi(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hashi_common(dentry, qstr, 0);
 }
 static int
-isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmp_common(dentry, a, b, 0);
+        return isofs_dentry_cmp_common(len, str, name, 0, 0);
 }
 static int
-isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmpi_common(dentry, a, b, 0);
+        return isofs_dentry_cmp_common(len, str, name, 0, 1);
 }
 #ifdef CONFIG_JOLIET
 static int
-isofs_hash_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hash_common(dentry, qstr, 1);
 }
 static int
-isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        return isofs_hashi_common(dentry, qstr, 1);
 }
 static int
-isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmp_common(dentry, a, b, 1);
+        return isofs_dentry_cmp_common(len, str, name, 1, 0);
 }
 static int
-isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        return isofs_dentry_cmpi_common(dentry, a, b, 1);
+        return isofs_dentry_cmp_common(len, str, name, 1, 1);
 }
 #endif
@@ -922,17 +939,18 @@ root_found:
                goto out_iput;
        }
-        /* get the root dentry */
-        s->s_root = d_alloc_root(inode);
-        if (!(s->s_root))
-                goto out_no_root;
        table = 0;
        if (joliet_level)
                table += 2;
        if (opt.check == 'r')
                table++;
-        s->s_root->d_op = &isofs_dentry_ops[table];
+        s->s_d_op = &isofs_dentry_ops[table];
+        /* get the root dentry */
+        s->s_root = d_alloc_root(inode);
+        if (!(s->s_root))
+                goto out_no_root;
        kfree(opt.iocharset);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 0d23abfd428..4fb3e8074fd 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
        qstr.name = compare;
        qstr.len = dlen;
-        return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr);
+        return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
+                        dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 /*
@@ -171,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
        struct inode *inode;
        struct page *page;
-        dentry->d_op = dir->i_sb->s_root->d_op;
        page = alloc_page(GFP_USER);
        if (!page)
                return ERR_PTR(-ENOMEM);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 846a3f31411..5b2e4c30a2a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
         * the committing transaction.  Really, we only need to give it
         * committing_transaction->t_outstanding_credits plus "enough" for
         * the log control blocks.
-         * Also, this test is inconsitent with the matching one in
+         * Also, this test is inconsistent with the matching one in
         * journal_extend().
         */
        if (__log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c590d155c09..9e4686900f1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
 #include <linux/bitops.h>
+#include <linux/ratelimit.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_inode_cache);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
        journal = kzalloc(sizeof(*journal), GFP_KERNEL);
        if (!journal)
-                goto fail;
+                return NULL;
        init_waitqueue_head(&journal->j_wait_transaction_locked);
        init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
        err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
        if (err) {
                kfree(journal);
-                goto fail;
+                return NULL;
        }
        spin_lock_init(&journal->j_history_lock);
        return journal;
-fail:
-        return NULL;
 }
 /* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        /* journal descriptor can store up to n blocks -bzzz */
        journal->j_blocksize = blocksize;
+        journal->j_dev = bdev;
+        journal->j_fs_dev = fs_dev;
+        journal->j_blk_offset = start;
+        journal->j_maxlen = len;
+        bdevname(journal->j_dev, journal->j_devname);
+        p = journal->j_devname;
+        while ((p = strchr(p, '/')))
+                *p = '!';
        jbd2_stats_proc_init(journal);
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
        journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
                        __func__);
                goto out_err;
        }
-        journal->j_dev = bdev;
-        journal->j_fs_dev = fs_dev;
-        journal->j_blk_offset = start;
-        journal->j_maxlen = len;
-        bdevname(journal->j_dev, journal->j_devname);
-        p = journal->j_devname;
-        while ((p = strchr(p, '/')))
-                *p = '!';
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
        if (!bh) {
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
        struct journal_head *ret;
-        static unsigned long last_warning;
 #ifdef CONFIG_JBD2_DEBUG
        atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
        if (!ret) {
                jbd_debug(1, "out of memory for journal_head\n");
-                if (time_after(jiffies, last_warning + 5*HZ)) {
+                pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
-                        printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-                               __func__);
-                        last_warning = jiffies;
-                }
                while (!ret) {
                        yield();
                        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
 #endif
-struct kmem_cache *jbd2_handle_cache;
+struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
 static int __init journal_init_handle_cache(void)
 {
-        jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
+        jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
-                                sizeof(handle_t),
-                                0,              /* offset */
-                                SLAB_TEMPORARY, /* flags */
-                                NULL);          /* ctor */
        if (jbd2_handle_cache == NULL) {
-                printk(KERN_EMERG "JBD: failed to create handle cache\n");
+                printk(KERN_EMERG "JBD2: failed to create handle cache\n");
+                return -ENOMEM;
+        }
+        jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
+        if (jbd2_inode_cache == NULL) {
+                printk(KERN_EMERG "JBD2: failed to create inode cache\n");
+                kmem_cache_destroy(jbd2_handle_cache);
                return -ENOMEM;
        }
        return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
 {
        if (jbd2_handle_cache)
                kmem_cache_destroy(jbd2_handle_cache);
+        if (jbd2_inode_cache)
+                kmem_cache_destroy(jbd2_inode_cache);
 }
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f..1cad869494f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD2_DEBUG
                int dropped = info.end_transaction - 
                        be32_to_cpu(journal->j_superblock->s_sequence);
-#endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
                          dropped, (dropped == 1) ? "" : "s");
+#endif
                journal->j_transaction_sequence = ++info.end_transaction;
        }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6bf0a242613..faad2bd787c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -251,7 +251,7 @@ repeat:
         * the committing transaction.  Really, we only need to give it
         * committing_transaction->t_outstanding_credits plus "enough" for
         * the log control blocks.
-         * Also, this test is inconsitent with the matching one in
+         * Also, this test is inconsistent with the matching one in
         * jbd2_journal_extend().
         */
        if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
                jbd2_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
-                goto out;
        }
-out:
        return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        transaction = handle->h_transaction;
        journal = transaction->t_journal;
-        jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+        jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
        JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -774,7 +772,7 @@ done:
                J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
                            "Possible IO failure.\n");
                page = jh2bh(jh)->b_page;
-                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+                offset = offset_in_page(jh2bh(jh)->b_data);
                source = kmap_atomic(page, KM_USER0);
                /* Fire data frozen trigger just before we copy the data */
                jbd2_buffer_frozen_trigger(jh, source + offset,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bb..95b79672150 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        return rc;
 }
-int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int rc;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d954..3119f59253d 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-extern int jffs2_check_acl(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int, unsigned int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02..3005ec4520a 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
        size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
 #ifndef __ECOS
        if (jffs2_blocks_use_vmalloc(c))
-                c->blocks = vmalloc(size);
+                c->blocks = vzalloc(size);
        else
 #endif
-                c->blocks = kmalloc(size, GFP_KERNEL);
+                c->blocks = kzalloc(size, GFP_KERNEL);
        if (!c->blocks)
                return -ENOMEM;
-        memset(c->blocks, 0, size);
        for (i=0; i<c->nr_blocks; i++) {
                INIT_LIST_HEAD(&c->blocks[i].list);
                c->blocks[i].offset = i * c->sector_size;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64..0bc6a6c80a5 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
        void *os_priv;
 };
-#endif /* _JFFS2_FB_SB */
+#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index c86041b866a..853b8e30008 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
        return &f->vfs_inode;
 }
-static void jffs2_destroy_inode(struct inode *inode)
+static void jffs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
+static void jffs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, jffs2_i_callback);
+}
 static void jffs2_i_init_once(void *foo)
 {
        struct jffs2_inode_info *f = foo;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a4..4f9cc048294 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
                JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
                            offset, je32_to_cpu(rx.hdr_crc), crc);
                xd->flags |= JFFS2_XFLAGS_INVALID;
-                return EIO;
+                return -EIO;
        }
        totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
        if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
                            je32_to_cpu(rx.xid), xd->xid,
                            je32_to_cpu(rx.version), xd->version);
                xd->flags |= JFFS2_XFLAGS_INVALID;
-                return EIO;
+                return -EIO;
        }
        xd->xprefix = rx.xprefix;
        xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
                              ref_offset(xd->node), xd->data_crc, crc);
                kfree(data);
                xd->flags |= JFFS2_XFLAGS_INVALID;
-                return EIO;
+                return -EIO;
        }
        xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
        if (xd->xname)
                return 0;
        if (xd->flags & JFFS2_XFLAGS_INVALID)
-                return EIO;
+                return -EIO;
        if (unlikely(is_xattr_datum_unchecked(c, xd)))
                rc = do_verify_xattr_datum(c, xd);
        if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
        if (crc != je32_to_cpu(rr.node_crc)) {
                JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
                            offset, je32_to_cpu(rr.node_crc), crc);
-                return EIO;
+                return -EIO;
        }
        if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
            || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
                            offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
                            je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
                            je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
-                return EIO;
+                return -EIO;
        }
        ref->ino = je32_to_cpu(rr.ino);
        ref->xid = je32_to_cpu(rr.xid);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4..e5de9422fa3 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
        return rc;
 }
-int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+        struct posix_acl *acl;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878..f9285c4900f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 #ifdef CONFIG_JFS_POSIX_ACL
-int jfs_check_acl(struct inode *, int);
+int jfs_check_acl(struct inode *, int, unsigned int flags);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_acl_chmod(struct inode *inode);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aa..278e3fb40b7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
         * file systems to log may have n-to-1 relationship;
         */
-        bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
+        bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+                                 log);
        if (IS_ERR(bdev)) {
                rc = -PTR_ERR(bdev);
                goto free;
        }
-        if ((rc = bd_claim(bdev, log))) {
-                goto close;
-        }
        log->bdev = bdev;
        memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
         * initialize log:
         */
        if ((rc = lmLogInit(log)))
-                goto unclaim;
+                goto close;
        list_add(&log->journal_list, &jfs_external_logs);
@@ -1163,11 +1160,8 @@ journal_found:
        list_del(&log->journal_list);
        lbmLogShutdown(log);
-      unclaim:
-        bd_release(bdev);
      close:            /* close external log device */
-        blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
      free:             /* free log descriptor */
        mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
        bdev = log->bdev;
        rc = lmLogShutdown(log);
-        bd_release(bdev);
+        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-        blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
        kfree(log);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 231ca4af9bc..81ead850ddb 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
 */
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/quotaops.h>
 #include <linux/exportfs.h>
@@ -1464,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
        jfs_info("jfs_lookup: name = %s", name);
-        if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
-                dentry->d_op = &jfs_ci_dentry_operations;
        if ((name[0] == '.') && (len == 1))
                inum = dip->i_ino;
        else if (strcmp(name, "..") == 0)
@@ -1491,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
                return ERR_CAST(ip);
        }
-        dentry = d_splice_alias(ip, dentry);
+        return d_splice_alias(ip, dentry);
-        if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
-                dentry->d_op = &jfs_ci_dentry_operations;
-        return dentry;
 }
 static struct inode *jfs_nfs_get_inode(struct super_block *sb,
@@ -1573,7 +1566,8 @@ const struct file_operations jfs_dir_operations = {
        .llseek         = generic_file_llseek,
 };
-static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
+                struct qstr *this)
 {
        unsigned long hash;
        int i;
@@ -1586,32 +1580,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
        return 0;
 }
-static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
+static int jfs_ci_compare(const struct dentry *parent,
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
        int i, result = 1;
-        if (a->len != b->len)
+        if (len != name->len)
                goto out;
-        for (i=0; i < a->len; i++) {
+        for (i=0; i < len; i++) {
-                if (tolower(a->name[i]) != tolower(b->name[i]))
+                if (tolower(str[i]) != tolower(name->name[i]))
                        goto out;
        }
        result = 0;
+out:
+        return result;
+}
+/*
+ * ->d_revalidate for the case-insensitive dentry operations.
+ *
+ * Returns -ECHILD when nd has LOOKUP_RCU set, 1 to keep the dentry and
+ * 0 to have it dropped.  A dentry with ->d_inode is always kept.  A
+ * negative dentry is dropped when nd is NULL, or when neither
+ * LOOKUP_CONTINUE nor LOOKUP_PARENT is set and LOOKUP_CREATE or
+ * LOOKUP_RENAME_TARGET is.
+ */
+static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        /* nd may be NULL (see the !nd check below), so test it first. */
+        if (nd && (nd->flags & LOOKUP_RCU))
+                return -ECHILD;
         /*
-         * We want creates to preserve case.  A negative dentry, a, that
+         * This is not a negative dentry. Always valid.
-         * has a different case than b may cause a new entry to be created
+         *
-         * with the wrong case.  Since we can't tell if a comes from a negative
+         * Note, rename() to an existing directory entry will have ->d_inode,
-         * dentry, we blindly replace it with b.  This should be harmless if
+         * and will use the existing name, not the name specified by the user.
-         * a is not a negative dentry.
+         *
+         * We may be able to drop this positive dentry here. But dropping
+         * a positive dentry isn't a good idea. So it's unsupported like
+         * rename("filename", "FILENAME") for now.
          */
-        memcpy((unsigned char *)a->name, b->name, a->len);
+        if (dentry->d_inode)
-out:
+                return 1;
-        return result;
+        /*
+         * This may be nfsd (or something), anyway, we can't see the
+         * intent of this. So, since this can be for creation, drop it.
+         */
+        if (!nd)
+                return 0;
+        /*
+         * Drop the negative dentry, in order to make sure to use the
+         * case sensitive name which is specified by the user if this is
+         * for creation.
+         */
+        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+                if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+                        return 0;
+        }
+        return 1;
 }
 const struct dentry_operations jfs_ci_dentry_operations =
 {
        .d_hash = jfs_ci_hash,
        .d_compare = jfs_ci_compare,
+        .d_revalidate = jfs_ci_revalidate,
 };
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0669fc1cc3b..eeca48a031a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
        return &jfs_inode->vfs_inode;
 }
+/*
+ * Callback handed to call_rcu() by jfs_destroy_inode(): reinitialise the
+ * inode's i_dentry list head and return the enclosing jfs_inode_info to
+ * jfs_inode_cachep.
+ */
+static void jfs_i_callback(struct rcu_head *head)
+{
+        struct inode *vfs_ip = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&vfs_ip->i_dentry);
+        kmem_cache_free(jfs_inode_cachep, JFS_IP(vfs_ip));
+}
 static void jfs_destroy_inode(struct inode *inode)
 {
        struct jfs_inode_info *ji = JFS_IP(inode);
@@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
                ji->active_ag = -1;
        }
        spin_unlock_irq(&ji->ag_lock);
-        kmem_cache_free(jfs_inode_cachep, ji);
+        call_rcu(&inode->i_rcu, jfs_i_callback);
 }
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -507,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = JFS_SUPER_MAGIC;
+        if (sbi->mntflag & JFS_OS2)
+                sb->s_d_op = &jfs_ci_dentry_operations;
        inode = jfs_iget(sb, ROOT_I);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
@@ -516,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        if (!sb->s_root)
                goto out_no_root;
-        if (sbi->mntflag & JFS_OS2)
-                sb->s_root->d_op = &jfs_ci_dentry_operations;
        /* logical blocks are represented by 40 bits in pxd_t, etc. */
        sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
 #if BITS_PER_LONG == 32
diff --git a/fs/libfs.c b/fs/libfs.c
index a3accdf528a..c88eab55aec 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
 #include <asm/uaccess.h>
+/* Nonzero when the dentry has an inode attached and is not unhashed. */
+static inline int simple_positive(struct dentry *dentry)
+{
+        if (!dentry->d_inode)
+                return 0;
+        return !d_unhashed(dentry);
+}
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
                   struct kstat *stat)
 {
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
-static int simple_delete_dentry(struct dentry *dentry)
+static int simple_delete_dentry(const struct dentry *dentry)
 {
        return 1;
 }
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
-        dentry->d_op = &simple_dentry_operations;
+        d_set_d_op(dentry, &simple_dentry_operations);
        d_add(dentry, NULL);
        return NULL;
 }
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 {
-        mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+        struct dentry *dentry = file->f_path.dentry;
+        mutex_lock(&dentry->d_inode->i_mutex);
        switch (origin) {
                case 1:
                        offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
                        if (offset >= 0)
                                break;
                default:
-                        mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+                        mutex_unlock(&dentry->d_inode->i_mutex);
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
                        struct dentry *cursor = file->private_data;
                        loff_t n = file->f_pos - 2;
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
+                        /* d_lock not required for cursor */
                        list_del(&cursor->d_u.d_child);
-                        p = file->f_path.dentry->d_subdirs.next;
+                        p = dentry->d_subdirs.next;
-                        while (n && p != &file->f_path.dentry->d_subdirs) {
+                        while (n && p != &dentry->d_subdirs) {
                                struct dentry *next;
                                next = list_entry(p, struct dentry, d_u.d_child);
-                                if (!d_unhashed(next) && next->d_inode)
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+                                if (simple_positive(next))
                                        n--;
+                                spin_unlock(&next->d_lock);
                                p = p->next;
                        }
                        list_add_tail(&cursor->d_u.d_child, p);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
                }
        }
-        mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+        mutex_unlock(&dentry->d_inode->i_mutex);
        return offset;
 }
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        i++;
                        /* fallthrough */
                default:
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
                        if (filp->f_pos == 2)
                                list_move(q, &dentry->d_subdirs);
                        for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
                                struct dentry *next;
                                next = list_entry(p, struct dentry, d_u.d_child);
-                                if (d_unhashed(next) || !next->d_inode)
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+                                if (!simple_positive(next)) {
+                                        spin_unlock(&next->d_lock);
                                        continue;
+                                }
-                                spin_unlock(&dcache_lock);
+                                spin_unlock(&next->d_lock);
+                                spin_unlock(&dentry->d_lock);
                                if (filldir(dirent, next->d_name.name, 
                                            next->d_name.len, filp->f_pos, 
                                            next->d_inode->i_ino, 
                                            dt_type(next->d_inode)) < 0)
                                        return 0;
-                                spin_lock(&dcache_lock);
+                                spin_lock(&dentry->d_lock);
+                                spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
                                /* next is still alive */
                                list_move(q, p);
+                                spin_unlock(&next->d_lock);
                                p = q;
                                filp->f_pos++;
                        }
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&dentry->d_lock);
        }
        return 0;
 }
@@ -202,7 +217,8 @@ static const struct super_operations simple_super_operations = {
 * will never be mountable)
 */
 struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
-        const struct super_operations *ops, unsigned long magic)
+        const struct super_operations *ops,
+        const struct dentry_operations *dops, unsigned long magic)
 {
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        struct dentry *dentry;
@@ -239,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
        dentry->d_parent = dentry;
        d_instantiate(dentry, root);
        s->s_root = dentry;
+        s->s_d_op = dops;
        s->s_flags |= MS_ACTIVE;
        return dget(s->s_root);
@@ -259,23 +276,23 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
        return 0;
 }
-static inline int simple_positive(struct dentry *dentry)
-{
-        return dentry->d_inode && !d_unhashed(dentry);
-}
 int simple_empty(struct dentry *dentry)
 {
        struct dentry *child;
        int ret = 0;
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
-        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
+        list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
-                if (simple_positive(child))
+                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+                if (simple_positive(child)) {
+                        spin_unlock(&child->d_lock);
                        goto out;
+                }
+                spin_unlock(&child->d_lock);
+        }
        ret = 1;
 out:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
        return ret;
 }
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab33..ca58d64374c 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_LOCKD) += lockd.o
-lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
+lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
-                svcproc.o svcsubs.o mon.o xdr.o grace.o
+                svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
-lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
 lockd-objs                    := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 00000000000..f848b52c67b
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
+/*
+ * linux/fs/lockd/clnt4xdr.c
+ *
+ * XDR functions to encode/decode NLM version 4 RPC arguments and results.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+#define NLMDBG_FACILITY         NLMDBG_XDR
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
+#  error "NLM host name cannot be larger than NLM's maximum string length!"
+#endif
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM4_void_sz            (0)
+#define NLM4_cookie_sz          (1+(NLM_MAXCOOKIELEN>>2))
+#define NLM4_caller_sz          (1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_owner_sz           (1+(NLMCLNT_OHSIZE>>2))
+#define NLM4_fhandle_sz         (1+(NFS3_FHSIZE>>2))
+#define NLM4_lock_sz            (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
+#define NLM4_holder_sz          (6+NLM4_owner_sz)
+#define NLM4_testargs_sz        (NLM4_cookie_sz+1+NLM4_lock_sz)
+#define NLM4_lockargs_sz        (NLM4_cookie_sz+4+NLM4_lock_sz)
+#define NLM4_cancargs_sz        (NLM4_cookie_sz+2+NLM4_lock_sz)
+#define NLM4_unlockargs_sz      (NLM4_cookie_sz+NLM4_lock_sz)
+#define NLM4_testres_sz         (NLM4_cookie_sz+1+NLM4_holder_sz)
+#define NLM4_res_sz             (NLM4_cookie_sz+1)
+#define NLM4_norep_sz           (0)
+/*
+ * Saturate a file offset to the symmetric range
+ * [-NLM4_OFFSET_MAX, NLM4_OFFSET_MAX]; values inside it pass through.
+ */
+static s64 loff_t_to_s64(loff_t offset)
+{
+        if (offset >= NLM4_OFFSET_MAX)
+                return NLM4_OFFSET_MAX;
+        if (offset <= -NLM4_OFFSET_MAX)
+                return -NLM4_OFFSET_MAX;
+        return offset;
+}
+/*
+ * Derive the l_offset/l_len pair to put on the wire from the byte
+ * range held in lock->fl.  A range ending at OFFSET_MAX is expressed
+ * as l_len == 0.
+ */
+static void nlm4_compute_offsets(const struct nlm_lock *lock,
+                                 u64 *l_offset, u64 *l_len)
+{
+        const struct file_lock *fl = &lock->fl;
+        /* Ranges we cannot represent must never get this far. */
+        BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
+        BUG_ON(fl->fl_end != OFFSET_MAX &&
+                                fl->fl_end > NLM4_OFFSET_MAX);
+        *l_offset = loff_t_to_s64(fl->fl_start);
+        if (fl->fl_end != OFFSET_MAX)
+                *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
+        else
+                *l_len = 0;
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ *
+ * Emits a dprintk naming the decoder (@func) that ran out of data,
+ * together with the pointer difference xdr->end - xdr->p, i.e. how
+ * much of the receive buffer was still left at that point.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+        dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+                "Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
+/*
+ * Encode/decode NLMv4 basic data types
+ *
+ * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
+ * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
+ * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/* Append one XDR boolean: xdr_one for a non-zero @value, else xdr_zero. */
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+        __be32 *slot = xdr_reserve_space(xdr, 4);
+        if (value)
+                *slot = xdr_one;
+        else
+                *slot = xdr_zero;
+}
+/* Append @value to the stream as one big-endian 32-bit word. */
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+        __be32 *slot = xdr_reserve_space(xdr, 4);
+        *slot = cpu_to_be32(value);
+}
+/*
+ *      typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+                          const u8 *data, const unsigned int length)
+{
+        __be32 *dst;
+        /* An oversized object here is a caller bug, not a wire error. */
+        BUG_ON(length > XDR_MAX_NETOBJ);
+        /* Room for the length word plus the opaque bytes themselves. */
+        dst = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(dst, data, length);
+}
+/*
+ * Decode a variable-length netobj.
+ *
+ * On success obj->len holds the object length and obj->data points at
+ * the payload inside the receive buffer (nothing is copied).  Returns
+ * 0, or -EIO if the advertised length exceeds XDR_MAX_NETOBJ or the
+ * receive buffer is too short to contain the object.
+ */
+static int decode_netobj(struct xdr_stream *xdr,
+                         struct xdr_netobj *obj)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length > XDR_MAX_NETOBJ))
+                goto out_size;
+        /*
+         * Pull the payload out of the stream too, exactly as
+         * decode_cookie() does.  This bounds-checks the object against
+         * the receive buffer and advances the stream past it, so the
+         * caller's next field is decoded from the right position.
+         */
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        obj->len = length;
+        obj->data = (u8 *)p;
+        return 0;
+out_size:
+        dprintk("NFS: returned netobj was too long: %u\n", length);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj cookie;
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+                          const struct nlm_cookie *cookie)
+{
+        const u8 *bytes = (u8 *)&cookie->data;
+        /* A cookie longer than the protocol maximum is a caller bug. */
+        BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
+        /* On the wire a cookie is just a netobj. */
+        encode_netobj(xdr, bytes, cookie->len);
+}
+/*
+ * Decode a cookie into @cookie, copying its bytes out of the receive
+ * buffer.  Returns 0 on success and -EIO on a short buffer or a
+ * cookie longer than NLM_MAXCOOKIELEN.
+ */
+static int decode_cookie(struct xdr_stream *xdr,
+                             struct nlm_cookie *cookie)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        u32 length;
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (length == 0) {
+                /* apparently HPUX can return empty cookies */
+                cookie->len = 4;
+                memset(cookie->data, 0, 4);
+                return 0;
+        }
+        if (length > NLM_MAXCOOKIELEN) {
+                dprintk("NFS: returned cookie was too long: %u\n", length);
+                return -EIO;
+        }
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        memcpy(cookie->data, p, length);
+        cookie->len = length;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj fh;
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        const u8 *handle = (u8 *)&fh->data;
+        /* File handles larger than NFS3_FHSIZE must never reach us. */
+        BUG_ON(fh->size > NFS3_FHSIZE);
+        encode_netobj(xdr, handle, fh->size);
+}
+/*
+ *      enum nlm4_stats {
+ *              NLM4_GRANTED = 0,
+ *              NLM4_DENIED = 1,
+ *              NLM4_DENIED_NOLOCKS = 2,
+ *              NLM4_BLOCKED = 3,
+ *              NLM4_DENIED_GRACE_PERIOD = 4,
+ *              NLM4_DEADLCK = 5,
+ *              NLM4_ROFS = 6,
+ *              NLM4_STALE_FH = 7,
+ *              NLM4_FBIG = 8,
+ *              NLM4_FAILED = 9
+ *      };
+ *
+ *      struct nlm4_stat {
+ *              nlm4_stats stat;
+ *      };
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+static void encode_nlm4_stat(struct xdr_stream *xdr,
+                             const __be32 stat)
+{
+        __be32 *slot;
+        /* Only values up to NLM_FAILED may be sent. */
+        BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
+        slot = xdr_reserve_space(xdr, 4);
+        /* @stat is already in network byte order: store it as-is. */
+        *slot = stat;
+}
+/*
+ * Decode an nlm4_stat into *stat, leaving it in network byte order.
+ * Returns 0 on success, or -EIO if the buffer is short or the value
+ * lies beyond NLM_FAILED.
+ */
+static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        /*
+         * The range check has to be done in host byte order: comparing
+         * two raw big-endian words gives the wrong ordering on
+         * little-endian machines.
+         */
+        if (unlikely(be32_to_cpup(p) > NLM_FAILED))
+                goto out_bad_xdr;
+        *stat = *p;
+        return 0;
+out_bad_xdr:
+        dprintk("%s: server returned invalid nlm4_stats value: %u\n",
+                        __func__, be32_to_cpup(p));
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      struct nlm4_holder {
+ *              bool    exclusive;
+ *              int32   svid;
+ *              netobj  oh;
+ *              uint64  l_offset;
+ *              uint64  l_len;
+ *      };
+ */
+/*
+ * Encode the nlm4_holder describing result->lock: the exclusive flag,
+ * svid, owner handle, then the 64-bit l_offset and l_len.
+ */
+static void encode_nlm4_holder(struct xdr_stream *xdr,
+                               const struct nlm_res *result)
+{
+        const struct nlm_lock *lock = &result->lock;
+        u64 l_offset, l_len;
+        __be32 *p;
+        /*
+         * "exclusive" means a write lock.  This matches the request
+         * encoders and decode_nlm4_holder(), which maps a non-zero
+         * value to F_WRLCK.
+         */
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_int32(xdr, lock->svid);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* l_offset and l_len are both hypers: 8 bytes apiece. */
+        p = xdr_reserve_space(xdr, 8 + 8);
+        nlm4_compute_offsets(lock, &l_offset, &l_len);
+        p = xdr_encode_hyper(p, l_offset);
+        xdr_encode_hyper(p, l_len);
+}
+/*
+ * Decode an nlm4_holder into result->lock.
+ *
+ * The lock is zeroed and re-initialised first, then filled in as a
+ * POSIX lock.  A zero l_len, or a range whose last byte does not fit
+ * in a signed 64-bit offset, is stored as ending at OFFSET_MAX.
+ * Returns 0 on success or -EIO on a malformed or truncated reply.
+ */
+static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+        struct nlm_lock *lock = &result->lock;
+        struct file_lock *fl = &lock->fl;
+        u64 l_offset, l_len;
+        u32 exclusive;
+        int error;
+        __be32 *p;
+        s64 end;
+        memset(lock, 0, sizeof(*lock));
+        locks_init_lock(fl);
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        exclusive = be32_to_cpup(p++);
+        lock->svid = be32_to_cpup(p);
+        fl->fl_pid = (pid_t)lock->svid;
+        error = decode_netobj(xdr, &lock->oh);
+        if (unlikely(error))
+                goto out;
+        p = xdr_inline_decode(xdr, 8 + 8);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fl->fl_flags = FL_POSIX;
+        fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+        p = xdr_decode_hyper(p, &l_offset);
+        xdr_decode_hyper(p, &l_len);
+        /*
+         * Keep the end of the range in 64 bits: NLMv4 offsets and
+         * lengths are hypers, and a 32-bit temporary would silently
+         * truncate any range reaching past 2GB.
+         */
+        end = l_offset + l_len - 1;
+        fl->fl_start = (loff_t)l_offset;
+        if (l_len == 0 || end < 0)
+                fl->fl_end = OFFSET_MAX;
+        else
+                fl->fl_end = (loff_t)end;
+        error = 0;
+out:
+        return error;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      string caller_name<LM_MAXSTRLEN>;
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+        /* NB: client-side does not set lock->len */
+        const u32 namelen = strlen(name);
+        __be32 *dst;
+        BUG_ON(namelen > NLM_MAXSTRLEN);
+        /* Length word followed by the name bytes. */
+        dst = xdr_reserve_space(xdr, 4 + namelen);
+        xdr_encode_opaque(dst, name, namelen);
+}
+/*
+ *      struct nlm4_lock {
+ *              string  caller_name<LM_MAXSTRLEN>;
+ *              netobj  fh;
+ *              netobj  oh;
+ *              int32   svid;
+ *              uint64  l_offset;
+ *              uint64  l_len;
+ *      };
+ */
+static void encode_nlm4_lock(struct xdr_stream *xdr,
+                             const struct nlm_lock *lock)
+{
+        u64 offset, len;
+        __be32 *cursor;
+        /* The three variable-length fields go first ... */
+        encode_caller_name(xdr, lock->caller);
+        encode_fh(xdr, &lock->fh);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* ... then svid (one word) and the two 64-bit range fields. */
+        cursor = xdr_reserve_space(xdr, 4 + 8 + 8);
+        *cursor++ = cpu_to_be32(lock->svid);
+        nlm4_compute_offsets(lock, &offset, &len);
+        cursor = xdr_encode_hyper(cursor, offset);
+        xdr_encode_hyper(cursor, len);
+}
+/*
+ * NLMv4 XDR encode functions
+ *
+ * NLMv4 argument types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      struct nlm4_testargs {
+ *              netobj cookie;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        /* A write lock is what the protocol calls "exclusive". */
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_lockargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *              bool reclaim;
+ *              int state;
+ *      };
+ */
+static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        /* A write lock is what the protocol calls "exclusive". */
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        /* Fields are emitted in the order nlm4_lockargs declares them. */
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+        encode_bool(xdr, args->reclaim);
+        encode_int32(xdr, args->state);
+}
+/*
+ *      struct nlm4_cancargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
+                                  struct xdr_stream *xdr,
+                                  const struct nlm_args *args)
+{
+        /* A write lock is what the protocol calls "exclusive". */
+        const int exclusive = args->lock.fl.fl_type == F_WRLCK;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, exclusive);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_unlockargs {
+ *              netobj cookie;
+ *              struct nlm4_lock alock;
+ *      };
+ */
+static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    const struct nlm_args *args)
+{
+        /* An unlock request is just the cookie followed by the lock. */
+        encode_cookie(xdr, &args->cookie);
+        encode_nlm4_lock(xdr, &args->lock);
+}
+/*
+ *      struct nlm4_res {
+ *              netobj cookie;
+ *              nlm4_stat stat;
+ *      };
+ */
+static void nlm4_xdr_enc_res(struct rpc_rqst *req,
+                             struct xdr_stream *xdr,
+                             const struct nlm_res *result)
+{
+        const struct nlm_cookie *cookie = &result->cookie;
+        const __be32 status = result->status;
+        /* Cookie first, then the status word. */
+        encode_cookie(xdr, cookie);
+        encode_nlm4_stat(xdr, status);
+}
+/*
+ *      union nlm4_testrply switch (nlm4_stats stat) {
+ *      case NLM4_DENIED:
+ *              struct nlm4_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm4_testres {
+ *              netobj cookie;
+ *              nlm4_testrply test_stat;
+ *      };
+ */
+static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_res *result)
+{
+        const __be32 status = result->status;
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm4_stat(xdr, status);
+        /* Only the "denied" arm of the union carries a holder. */
+        if (status != nlm_lck_denied)
+                return;
+        encode_nlm4_holder(xdr, result);
+}
+/*
+ * NLMv4 XDR decode functions
+ *
+ * NLMv4 result types are defined in Appendix II of RFC 1813:
+ * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      union nlm4_testrply switch (nlm4_stats stat) {
+ *      case NLM4_DENIED:
+ *              struct nlm4_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm4_testres {
+ *              netobj cookie;
+ *              nlm4_testrply test_stat;
+ *      };
+ */
+static int decode_nlm4_testrply(struct xdr_stream *xdr,
+                                struct nlm_res *result)
+{
+        int status = decode_nlm4_stat(xdr, &result->status);
+        if (unlikely(status))
+                return status;
+        /* Only the "denied" arm of the union carries a holder. */
+        if (result->status != nlm_lck_denied)
+                return 0;
+        return decode_nlm4_holder(xdr, result);
+}
+/* Decode an nlm4_testres: the cookie, then the nlm4_testrply union. */
+static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
+                                struct xdr_stream *xdr,
+                                struct nlm_res *result)
+{
+        int status = decode_cookie(xdr, &result->cookie);
+        if (unlikely(status))
+                return status;
+        return decode_nlm4_testrply(xdr, result);
+}
+/*
+ *      struct nlm4_res {
+ *              netobj cookie;
+ *              nlm4_stat stat;
+ *      };
+ */
+static int nlm4_xdr_dec_res(struct rpc_rqst *req,
+                            struct xdr_stream *xdr,
+                            struct nlm_res *result)
+{
+        int status = decode_cookie(xdr, &result->cookie);
+        /* Without a cookie there is no point looking at the status. */
+        if (unlikely(status))
+                return status;
+        return decode_nlm4_stat(xdr, &result->status);
+}
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm4_xdr_dec_norep      NULL
+/*
+ * Build the rpc_procinfo entry for NLMPROC_<proc>, placed at that index
+ * in the table.  The encoder is nlm4_xdr_enc_<argtype>, the decoder is
+ * nlm4_xdr_dec_<restype>, and the buffer sizes come from the matching
+ * NLM4_<argtype>_sz / NLM4_<restype>_sz word counts defined above.
+ */
+#define PROC(proc, argtype, restype)                                    \
+[NLMPROC_##proc] = {                                                    \
+        .p_proc      = NLMPROC_##proc,                                  \
+        .p_encode    = (kxdreproc_t)nlm4_xdr_enc_##argtype,             \
+        .p_decode    = (kxdrdproc_t)nlm4_xdr_dec_##restype,             \
+        .p_arglen    = NLM4_##argtype##_sz,                             \
+        .p_replen    = NLM4_##restype##_sz,                             \
+        .p_statidx   = NLMPROC_##proc,                                  \
+        .p_name      = #proc,                                           \
+        }
+/*
+ * NLMv4 client procedure table.  The *_MSG and *_RES procedures use
+ * "norep": a NULL decoder and a zero-length reply.  The *_RES entries
+ * send a result structure (res/testres) as their argument.
+ */
+static struct rpc_procinfo      nlm4_procedures[] = {
+        PROC(TEST,              testargs,       testres),
+        PROC(LOCK,              lockargs,       res),
+        PROC(CANCEL,            cancargs,       res),
+        PROC(UNLOCK,            unlockargs,     res),
+        PROC(GRANTED,           testargs,       res),
+        PROC(TEST_MSG,          testargs,       norep),
+        PROC(LOCK_MSG,          lockargs,       norep),
+        PROC(CANCEL_MSG,        cancargs,       norep),
+        PROC(UNLOCK_MSG,        unlockargs,     norep),
+        PROC(GRANTED_MSG,       testargs,       norep),
+        PROC(TEST_RES,          testres,        norep),
+        PROC(LOCK_RES,          res,            norep),
+        PROC(CANCEL_RES,        res,            norep),
+        PROC(UNLOCK_RES,        res,            norep),
+        PROC(GRANTED_RES,       res,            norep),
+};
+/*
+ * RPC version descriptor for NLM version 4, backed by the
+ * nlm4_procedures table above.  Not static: it is visible outside
+ * this file.
+ */
+struct rpc_version      nlm_version4 = {
+        .number         = 4,
+        .nrprocs        = ARRAY_SIZE(nlm4_procedures),
+        .procs          = nlm4_procedures,
+};
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6..8d4ea8351e3 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/smp_lock.h>
 #include <linux/kthread.h>
 #define NLMDBG_FACILITY         NLMDBG_CLIENT
@@ -80,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
 */
 void nlmclnt_done(struct nlm_host *host)
 {
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        lockd_down();
 }
 EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -274,7 +273,7 @@ restart:
        spin_unlock(&nlm_blocked_lock);
        /* Release host handle after use */
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        lockd_down();
        return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b..adb45ec9038 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
 */
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
@@ -59,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
                return;
        list_del(&lockowner->list);
        spin_unlock(&lockowner->host->h_lock);
-        nlm_release_host(lockowner->host);
+        nlmclnt_release_host(lockowner->host);
        kfree(lockowner);
 }
@@ -208,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
                printk("nlm_alloc_call: failed, waiting for memory\n");
                schedule_timeout_interruptible(5*HZ);
        }
-        nlm_release_host(host);
+        nlmclnt_release_host(host);
        return NULL;
 }
-void nlm_release_call(struct nlm_rqst *call)
+void nlmclnt_release_call(struct nlm_rqst *call)
 {
        if (!atomic_dec_and_test(&call->a_count))
                return;
-        nlm_release_host(call->a_host);
+        nlmclnt_release_host(call->a_host);
        nlmclnt_release_lockargs(call);
        kfree(call);
 }
 static void nlmclnt_rpc_release(void *data)
 {
-        nlm_release_call(data);
+        nlmclnt_release_call(data);
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -437,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
                        status = nlm_stat_to_errno(req->a_res.status);
        }
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
@@ -594,7 +593,7 @@ again:
 out_unblock:
        nlmclnt_finish_block(block);
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 out_unlock:
        /* Fatal error: ensure that we remove the lock altogether */
@@ -695,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
        /* What to do now? I'm out of my depth... */
        status = -ENOLCK;
 out:
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
@@ -756,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
                        NLMPROC_CANCEL, &nlmclnt_cancel_ops);
        if (status == 0 && req->a_res.status == nlm_lck_denied)
                status = -ENOLCK;
-        nlm_release_call(req);
+        nlmclnt_release_call(req);
        return status;
 }
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 00000000000..180ac34feb9
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
+/*
+ * linux/fs/lockd/clntxdr.c
+ *
+ * XDR functions to encode/decode NLM version 3 RPC arguments and results.
+ * NLM version 3 is backwards compatible with NLM versions 1 and 2.
+ *
+ * NLM client-side only.
+ *
+ * Copyright (C) 2010, Oracle.  All rights reserved.
+ */
+#include <linux/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/lockd/lockd.h>
+#define NLMDBG_FACILITY         NLMDBG_XDR
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+/*
+ * Declare the space requirements for NLM arguments and replies as
+ * number of 32bit-words
+ */
+#define NLM_cookie_sz           (1+(NLM_MAXCOOKIELEN>>2))
+#define NLM_caller_sz           (1+(NLMCLNT_OHSIZE>>2))
+#define NLM_owner_sz            (1+(NLMCLNT_OHSIZE>>2))
+#define NLM_fhandle_sz          (1+(NFS2_FHSIZE>>2))
+#define NLM_lock_sz             (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
+#define NLM_holder_sz           (4+NLM_owner_sz)
+#define NLM_testargs_sz         (NLM_cookie_sz+1+NLM_lock_sz)
+#define NLM_lockargs_sz         (NLM_cookie_sz+4+NLM_lock_sz)
+#define NLM_cancargs_sz         (NLM_cookie_sz+2+NLM_lock_sz)
+#define NLM_unlockargs_sz       (NLM_cookie_sz+NLM_lock_sz)
+#define NLM_testres_sz          (NLM_cookie_sz+1+NLM_holder_sz)
+#define NLM_res_sz              (NLM_cookie_sz+1)
+#define NLM_norep_sz            (0)
+/*
+ * Saturate a file offset to the symmetric range
+ * [-NLM_OFFSET_MAX, NLM_OFFSET_MAX]; values inside it pass through.
+ */
+static s32 loff_t_to_s32(loff_t offset)
+{
+        if (offset >= NLM_OFFSET_MAX)
+                return NLM_OFFSET_MAX;
+        if (offset <= -NLM_OFFSET_MAX)
+                return -NLM_OFFSET_MAX;
+        return offset;
+}
+/*
+ * Derive the 32-bit l_offset/l_len pair to put on the wire from the
+ * byte range held in lock->fl.  A range ending at OFFSET_MAX is
+ * expressed as l_len == 0.
+ */
+static void nlm_compute_offsets(const struct nlm_lock *lock,
+                                u32 *l_offset, u32 *l_len)
+{
+        const struct file_lock *fl = &lock->fl;
+        /* Ranges we cannot represent must never get this far. */
+        BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
+        BUG_ON(fl->fl_end != OFFSET_MAX &&
+                                fl->fl_end > NLM_OFFSET_MAX);
+        *l_offset = loff_t_to_s32(fl->fl_start);
+        if (fl->fl_end != OFFSET_MAX)
+                *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
+        else
+                *l_len = 0;
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ *
+ * Emits a dprintk naming the decoder (@func) that ran out of data,
+ * together with the pointer difference xdr->end - xdr->p, i.e. how
+ * much of the receive buffer was still left at that point.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+        dprintk("lockd: %s prematurely hit the end of our receive buffer. "
+                "Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
+/*
+ * Encode/decode NLMv3 basic data types
+ *
+ * Basic NLMv3 data types are not defined in an IETF standards
+ * document.  X/Open has a description of these data types that
+ * is useful.  See Chapter 10 of "Protocols for Interworking:
+ * XNFS, Version 3W".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/* Append one XDR boolean: xdr_one for a non-zero @value, else xdr_zero. */
+static void encode_bool(struct xdr_stream *xdr, const int value)
+{
+        __be32 *slot = xdr_reserve_space(xdr, 4);
+        if (value)
+                *slot = xdr_one;
+        else
+                *slot = xdr_zero;
+}
+/* Append @value to the stream as one big-endian 32-bit word. */
+static void encode_int32(struct xdr_stream *xdr, const s32 value)
+{
+        __be32 *slot = xdr_reserve_space(xdr, 4);
+        *slot = cpu_to_be32(value);
+}
+/*
+ *      typedef opaque netobj<MAXNETOBJ_SZ>
+ */
+static void encode_netobj(struct xdr_stream *xdr,
+                          const u8 *data, const unsigned int length)
+{
+        __be32 *dst;
+        /* An oversized object here is a caller bug, not a wire error. */
+        BUG_ON(length > XDR_MAX_NETOBJ);
+        /* Room for the length word plus the opaque bytes themselves. */
+        dst = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(dst, data, length);
+}
+/*
+ * Decode a variable-length netobj.
+ *
+ * On success obj->len holds the object length and obj->data points at
+ * the payload inside the receive buffer (nothing is copied).  Returns
+ * 0, or -EIO if the advertised length exceeds XDR_MAX_NETOBJ or the
+ * receive buffer is too short to contain the object.
+ */
+static int decode_netobj(struct xdr_stream *xdr,
+                         struct xdr_netobj *obj)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length > XDR_MAX_NETOBJ))
+                goto out_size;
+        /*
+         * Pull the payload out of the stream too, exactly as
+         * decode_cookie() does.  This bounds-checks the object against
+         * the receive buffer and advances the stream past it, so the
+         * caller's next field is decoded from the right position.
+         */
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        obj->len = length;
+        obj->data = (u8 *)p;
+        return 0;
+out_size:
+        dprintk("NFS: returned netobj was too long: %u\n", length);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj cookie;
+ */
+static void encode_cookie(struct xdr_stream *xdr,
+                          const struct nlm_cookie *cookie)
+{
+        const u8 *bytes = (u8 *)&cookie->data;
+        /* A cookie longer than the protocol maximum is a caller bug. */
+        BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
+        /* On the wire a cookie is just a netobj. */
+        encode_netobj(xdr, bytes, cookie->len);
+}
+/*
+ * Decode a cookie into @cookie, copying its bytes out of the receive
+ * buffer.  Returns 0 on success and -EIO on a short buffer or a
+ * cookie longer than NLM_MAXCOOKIELEN.
+ */
+static int decode_cookie(struct xdr_stream *xdr,
+                         struct nlm_cookie *cookie)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        u32 length;
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (length == 0) {
+                /* apparently HPUX can return empty cookies */
+                cookie->len = 4;
+                memset(cookie->data, 0, 4);
+                return 0;
+        }
+        if (length > NLM_MAXCOOKIELEN) {
+                dprintk("NFS: returned cookie was too long: %u\n", length);
+                return -EIO;
+        }
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        memcpy(cookie->data, p, length);
+        cookie->len = length;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      netobj fh;
+ */
+static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        const u8 *handle = (u8 *)&fh->data;
+        /* Here a file handle is always exactly NFS2_FHSIZE bytes. */
+        BUG_ON(fh->size != NFS2_FHSIZE);
+        encode_netobj(xdr, handle, NFS2_FHSIZE);
+}
+/*
+ *      enum nlm_stats {
+ *              LCK_GRANTED = 0,
+ *              LCK_DENIED = 1,
+ *              LCK_DENIED_NOLOCKS = 2,
+ *              LCK_BLOCKED = 3,
+ *              LCK_DENIED_GRACE_PERIOD = 4
+ *      };
+ *
+ *
+ *      struct nlm_stat {
+ *              nlm_stats stat;
+ *      };
+ *
+ * NB: we don't swap bytes for the NLM status values.  The upper
+ * layers deal directly with the status value in network byte
+ * order.
+ */
+static void encode_nlm_stat(struct xdr_stream *xdr,
+                            const __be32 stat)
+{
+        __be32 *slot;
+        /* Only values up to NLM_LCK_DENIED_GRACE_PERIOD may be sent. */
+        BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
+        slot = xdr_reserve_space(xdr, 4);
+        /* @stat is already in network byte order: store it as-is. */
+        *slot = stat;
+}
+/*
+ * Decode an nlm_stat into *stat, leaving it in network byte order.
+ * Returns 0 on success, or -EIO if the buffer is short or the value
+ * lies beyond NLM_LCK_DENIED_GRACE_PERIOD.
+ */
+static int decode_nlm_stat(struct xdr_stream *xdr,
+                           __be32 *stat)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        /*
+         * The range check has to be done in host byte order: comparing
+         * two raw big-endian words gives the wrong ordering on
+         * little-endian machines.
+         */
+        if (unlikely(be32_to_cpup(p) > NLM_LCK_DENIED_GRACE_PERIOD))
+                goto out_enum;
+        *stat = *p;
+        return 0;
+out_enum:
+        dprintk("%s: server returned invalid nlm_stats value: %u\n",
+                __func__, be32_to_cpup(p));
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      struct nlm_holder {
+ *              bool exclusive;
+ *              int uppid;
+ *              netobj oh;
+ *              unsigned l_offset;
+ *              unsigned l_len;
+ *      };
+ */
+/*
+ * Encode the nlm_holder describing result->lock: the exclusive flag,
+ * uppid (svid), owner handle, then the 32-bit l_offset and l_len.
+ */
+static void encode_nlm_holder(struct xdr_stream *xdr,
+                              const struct nlm_res *result)
+{
+        const struct nlm_lock *lock = &result->lock;
+        u32 l_offset, l_len;
+        __be32 *p;
+        /*
+         * "exclusive" means a write lock.  This matches the request
+         * encoders and decode_nlm_holder(), which maps a non-zero
+         * value to F_WRLCK.
+         */
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_int32(xdr, lock->svid);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* l_offset and l_len are one 32-bit word each. */
+        p = xdr_reserve_space(xdr, 4 + 4);
+        nlm_compute_offsets(lock, &l_offset, &l_len);
+        *p++ = cpu_to_be32(l_offset);
+        *p   = cpu_to_be32(l_len);
+}
+/*
+ * Decode an nlm_holder into result->lock.  The lock is zeroed and
+ * re-initialised first, then filled in as a POSIX lock.  Returns 0 on
+ * success or -EIO on a malformed or truncated reply.
+ */
+static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
+{
+        struct nlm_lock *lock = &result->lock;
+        struct file_lock *fl = &lock->fl;
+        u32 exclusive, l_offset, l_len;
+        s32 end;
+        __be32 *p;
+        int error;
+        memset(lock, 0, sizeof(*lock));
+        locks_init_lock(fl);
+        /* exclusive flag and svid */
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        exclusive = be32_to_cpup(p++);
+        lock->svid = be32_to_cpup(p);
+        fl->fl_pid = (pid_t)lock->svid;
+        /* owner handle */
+        error = decode_netobj(xdr, &lock->oh);
+        if (unlikely(error))
+                return error;
+        /* l_offset and l_len */
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        l_offset = be32_to_cpup(p++);
+        l_len = be32_to_cpup(p);
+        end = l_offset + l_len - 1;
+        fl->fl_flags = FL_POSIX;
+        if (exclusive != 0)
+                fl->fl_type = F_WRLCK;
+        else
+                fl->fl_type = F_RDLCK;
+        fl->fl_start = (loff_t)l_offset;
+        /* Zero length, or an end that went negative, means "to EOF". */
+        if (l_len != 0 && end >= 0)
+                fl->fl_end = (loff_t)end;
+        else
+                fl->fl_end = OFFSET_MAX;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      string caller_name<LM_MAXSTRLEN>;
+ */
+static void encode_caller_name(struct xdr_stream *xdr, const char *name)
+{
+        /* NB: client-side does not set lock->len */
+        const u32 namelen = strlen(name);
+        __be32 *dst;
+        BUG_ON(namelen > NLM_MAXSTRLEN);
+        /* Length word followed by the name bytes. */
+        dst = xdr_reserve_space(xdr, 4 + namelen);
+        xdr_encode_opaque(dst, name, namelen);
+}
+/*
+ *      struct nlm_lock {
+ *              string caller_name<LM_MAXSTRLEN>;
+ *              netobj fh;
+ *              netobj oh;
+ *              int uppid;
+ *              unsigned l_offset;
+ *              unsigned l_len;
+ *      };
+ */
+static void encode_nlm_lock(struct xdr_stream *xdr,
+                            const struct nlm_lock *lock)
+{
+        u32 offset, len;
+        __be32 *cursor;
+        /* The three variable-length fields go first ... */
+        encode_caller_name(xdr, lock->caller);
+        encode_fh(xdr, &lock->fh);
+        encode_netobj(xdr, lock->oh.data, lock->oh.len);
+        /* ... then three single words: svid, l_offset, l_len. */
+        cursor = xdr_reserve_space(xdr, 4 + 4 + 4);
+        *cursor++ = cpu_to_be32(lock->svid);
+        nlm_compute_offsets(lock, &offset, &len);
+        *cursor++ = cpu_to_be32(offset);
+        *cursor   = cpu_to_be32(len);
+}
+/*
+ * NLMv3 XDR encode functions
+ *
+ * NLMv3 argument types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      struct nlm_testargs {
+ *              netobj cookie;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_lockargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *              bool reclaim;
+ *              int state;
+ *      };
+ */
+static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+        encode_bool(xdr, args->reclaim);
+        encode_int32(xdr, args->state);
+}
+/*
+ *      struct nlm_cancargs {
+ *              netobj cookie;
+ *              bool block;
+ *              bool exclusive;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_bool(xdr, args->block);
+        encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_unlockargs {
+ *              netobj cookie;
+ *              struct nlm_lock alock;
+ *      };
+ */
+static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nlm_args *args)
+{
+        const struct nlm_lock *lock = &args->lock;
+        encode_cookie(xdr, &args->cookie);
+        encode_nlm_lock(xdr, lock);
+}
+/*
+ *      struct nlm_res {
+ *              netobj cookie;
+ *              nlm_stat stat;
+ *      };
+ */
+static void nlm_xdr_enc_res(struct rpc_rqst *req,
+                            struct xdr_stream *xdr,
+                            const struct nlm_res *result)
+{
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm_stat(xdr, result->status);
+}
+/*
+ *      union nlm_testrply switch (nlm_stats stat) {
+ *      case LCK_DENIED:
+ *              struct nlm_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm_testres {
+ *              netobj cookie;
+ *              nlm_testrply test_stat;
+ *      };
+ */
+static void encode_nlm_testrply(struct xdr_stream *xdr,
+                                const struct nlm_res *result)
+{
+        if (result->status == nlm_lck_denied)
+                encode_nlm_holder(xdr, result);
+}
+static void nlm_xdr_enc_testres(struct rpc_rqst *req,
+                                struct xdr_stream *xdr,
+                                const struct nlm_res *result)
+{
+        encode_cookie(xdr, &result->cookie);
+        encode_nlm_stat(xdr, result->status);
+        encode_nlm_testrply(xdr, result);
+}
+/*
+ * NLMv3 XDR decode functions
+ *
+ * NLMv3 result types are defined in Chapter 10 of The Open Group's
+ * "Protocols for Interworking: XNFS, Version 3W".
+ */
+/*
+ *      union nlm_testrply switch (nlm_stats stat) {
+ *      case LCK_DENIED:
+ *              struct nlm_holder holder;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct nlm_testres {
+ *              netobj cookie;
+ *              nlm_testrply test_stat;
+ *      };
+ */
+static int decode_nlm_testrply(struct xdr_stream *xdr,
+                               struct nlm_res *result)
+{
+        int error;
+        error = decode_nlm_stat(xdr, &result->status);
+        if (unlikely(error))
+                goto out;
+        if (result->status == nlm_lck_denied)
+                error = decode_nlm_holder(xdr, result);
+out:
+        return error;
+}
+static int nlm_xdr_dec_testres(struct rpc_rqst *req,
+                               struct xdr_stream *xdr,
+                               struct nlm_res *result)
+{
+        int error;
+        error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                goto out;
+        error = decode_nlm_testrply(xdr, result);
+out:
+        return error;
+}
+/*
+ *      struct nlm_res {
+ *              netobj cookie;
+ *              nlm_stat stat;
+ *      };
+ */
+static int nlm_xdr_dec_res(struct rpc_rqst *req,
+                           struct xdr_stream *xdr,
+                           struct nlm_res *result)
+{
+        int error;
+        error = decode_cookie(xdr, &result->cookie);
+        if (unlikely(error))
+                goto out;
+        error = decode_nlm_stat(xdr, &result->status);
+out:
+        return error;
+}
+/*
+ * For NLM, a void procedure really returns nothing
+ */
+#define nlm_xdr_dec_norep       NULL
+#define PROC(proc, argtype, restype)    \
+[NLMPROC_##proc] = {                                                    \
+        .p_proc      = NLMPROC_##proc,                                  \
+        .p_encode    = (kxdreproc_t)nlm_xdr_enc_##argtype,              \
+        .p_decode    = (kxdrdproc_t)nlm_xdr_dec_##restype,              \
+        .p_arglen    = NLM_##argtype##_sz,                              \
+        .p_replen    = NLM_##restype##_sz,                              \
+        .p_statidx   = NLMPROC_##proc,                                  \
+        .p_name      = #proc,                                           \
+        }
+static struct rpc_procinfo      nlm_procedures[] = {
+        PROC(TEST,              testargs,       testres),
+        PROC(LOCK,              lockargs,       res),
+        PROC(CANCEL,            cancargs,       res),
+        PROC(UNLOCK,            unlockargs,     res),
+        PROC(GRANTED,           testargs,       res),
+        PROC(TEST_MSG,          testargs,       norep),
+        PROC(LOCK_MSG,          lockargs,       norep),
+        PROC(CANCEL_MSG,        cancargs,       norep),
+        PROC(UNLOCK_MSG,        unlockargs,     norep),
+        PROC(GRANTED_MSG,       testargs,       norep),
+        PROC(TEST_RES,          testres,        norep),
+        PROC(LOCK_RES,          res,            norep),
+        PROC(CANCEL_RES,        res,            norep),
+        PROC(UNLOCK_RES,        res,            norep),
+        PROC(GRANTED_RES,       res,            norep),
+};
+static struct rpc_version       nlm_version1 = {
+                .number         = 1,
+                .nrprocs        = ARRAY_SIZE(nlm_procedures),
+                .procs          = nlm_procedures,
+};
+static struct rpc_version       nlm_version3 = {
+                .number         = 3,
+                .nrprocs        = ARRAY_SIZE(nlm_procedures),
+                .procs          = nlm_procedures,
+};
+static struct rpc_version       *nlm_versions[] = {
+        [1] = &nlm_version1,
+        [3] = &nlm_version3,
+#ifdef CONFIG_LOCKD_V4
+        [4] = &nlm_version4,
+#endif
+};
+static struct rpc_stat          nlm_rpc_stats;
+struct rpc_program              nlm_program = {
+                .name           = "lockd",
+                .number         = NLM_PROGRAM,
+                .nrvers         = ARRAY_SIZE(nlm_versions),
+                .version        = nlm_versions,
+                .stats          = &nlm_rpc_stats,
+};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b..b7c99bfb3da 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
 #define NLM_HOST_EXPIRE         (300 * HZ)
 #define NLM_HOST_COLLECT        (120 * HZ)
-static struct hlist_head        nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head        nlm_server_hosts[NLM_HOST_NRHASH];
+static struct hlist_head        nlm_client_hosts[NLM_HOST_NRHASH];
+#define for_each_host(host, pos, chain, table) \
+        for ((chain) = (table); \
+             (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+                hlist_for_each_entry((host), (pos), (chain), h_hash)
+#define for_each_host_safe(host, pos, next, chain, table) \
+        for ((chain) = (table); \
+             (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+                hlist_for_each_entry_safe((host), (pos), (next), \
+                                                (chain), h_hash)
 static unsigned long            next_gc;
-static int                      nrhosts;
+static unsigned long            nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 static void                     nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
        const u32               version;        /* NLM version to search for */
        const char              *hostname;      /* remote's hostname */
        const size_t            hostname_len;   /* it's length */
-        const struct sockaddr   *src_sap;       /* our address (optional) */
-        const size_t            src_len;        /* it's length */
        const int               noresvport;     /* use non-priv port */
 };
@@ -88,126 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 }
 /*
- * Common host lookup routine for server & client
+ * Allocate and initialize an nlm_host.  Common to both client and server.
 */
-static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
+static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+                                       struct nsm_handle *nsm)
 {
-        struct hlist_head *chain;
+        struct nlm_host *host = NULL;
-        struct hlist_node *pos;
+        unsigned long now = jiffies;
-        struct nlm_host *host;
-        struct nsm_handle *nsm = NULL;
-        mutex_lock(&nlm_host_mutex);
-        if (time_after_eq(jiffies, next_gc))
-                nlm_gc_hosts();
-        /* We may keep several nlm_host objects for a peer, because each
-         * nlm_host is identified by
-         * (address, protocol, version, server/client)
-         * We could probably simplify this a little by putting all those
-         * different NLM rpc_clients into one single nlm_host object.
-         * This would allow us to have one nlm_host per address.
-         */
-        chain = &nlm_hosts[nlm_hash_address(ni->sap)];
-        hlist_for_each_entry(host, pos, chain, h_hash) {
-                if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
-                        continue;
-                /* See if we have an NSM handle for this client */
-                if (!nsm)
-                        nsm = host->h_nsmhandle;
-                if (host->h_proto != ni->protocol)
-                        continue;
-                if (host->h_version != ni->version)
-                        continue;
-                if (host->h_server != ni->server)
-                        continue;
-                if (ni->server &&
-                    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
-                        continue;
-                /* Move to head of hash chain. */
-                hlist_del(&host->h_hash);
-                hlist_add_head(&host->h_hash, chain);
-                nlm_get_host(host);
-                dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
-                                host->h_name, host->h_addrbuf);
-                goto out;
-        }
-        /*
+        if (nsm != NULL)
-         * The host wasn't in our hash table.  If we don't
-         * have an NSM handle for it yet, create one.
-         */
-        if (nsm)
                atomic_inc(&nsm->sm_count);
        else {
                host = NULL;
                nsm = nsm_get_handle(ni->sap, ni->salen,
                                        ni->hostname, ni->hostname_len);
-                if (!nsm) {
+                if (unlikely(nsm == NULL)) {
-                        dprintk("lockd: nlm_lookup_host failed; "
+                        dprintk("lockd: %s failed; no nsm handle\n",
-                                "no nsm handle\n");
+                                __func__);
                        goto out;
                }
        }
-        host = kzalloc(sizeof(*host), GFP_KERNEL);
+        host = kmalloc(sizeof(*host), GFP_KERNEL);
-        if (!host) {
+        if (unlikely(host == NULL)) {
+                dprintk("lockd: %s failed; no memory\n", __func__);
                nsm_release(nsm);
-                dprintk("lockd: nlm_lookup_host failed; no memory\n");
                goto out;
        }
-        host->h_name       = nsm->sm_name;
-        host->h_addrbuf    = nsm->sm_addrbuf;
        memcpy(nlm_addr(host), ni->sap, ni->salen);
-        host->h_addrlen = ni->salen;
+        host->h_addrlen    = ni->salen;
        rpc_set_port(nlm_addr(host), 0);
-        memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+        host->h_srcaddrlen = 0;
+        host->h_rpcclnt    = NULL;
+        host->h_name       = nsm->sm_name;
        host->h_version    = ni->version;
        host->h_proto      = ni->protocol;
-        host->h_rpcclnt    = NULL;
+        host->h_reclaiming = 0;
-        mutex_init(&host->h_mutex);
+        host->h_server     = ni->server;
-        host->h_nextrebind = jiffies + NLM_HOST_REBIND;
+        host->h_noresvport = ni->noresvport;
-        host->h_expires    = jiffies + NLM_HOST_EXPIRE;
+        host->h_inuse      = 0;
-        atomic_set(&host->h_count, 1);
        init_waitqueue_head(&host->h_gracewait);
        init_rwsem(&host->h_rwsem);
-        host->h_state      = 0;                 /* pseudo NSM state */
+        host->h_state      = 0;
-        host->h_nsmstate   = 0;                 /* real NSM state */
+        host->h_nsmstate   = 0;
-        host->h_nsmhandle  = nsm;
+        host->h_pidcount   = 0;
-        host->h_server     = ni->server;
+        atomic_set(&host->h_count, 1);
-        host->h_noresvport = ni->noresvport;
+        mutex_init(&host->h_mutex);
-        hlist_add_head(&host->h_hash, chain);
+        host->h_nextrebind = now + NLM_HOST_REBIND;
+        host->h_expires    = now + NLM_HOST_EXPIRE;
        INIT_LIST_HEAD(&host->h_lockowners);
        spin_lock_init(&host->h_lock);
        INIT_LIST_HEAD(&host->h_granted);
        INIT_LIST_HEAD(&host->h_reclaim);
+        host->h_nsmhandle  = nsm;
-        nrhosts++;
+        host->h_addrbuf    = nsm->sm_addrbuf;
-        dprintk("lockd: nlm_lookup_host created host %s\n",
-                        host->h_name);
 out:
-        mutex_unlock(&nlm_host_mutex);
        return host;
 }
 /*
- * Destroy a host
+ * Destroy an nlm_host and free associated resources
+ *
+ * Caller must hold nlm_host_mutex.
 */
-static void
+static void nlm_destroy_host_locked(struct nlm_host *host)
-nlm_destroy_host(struct nlm_host *host)
 {
        struct rpc_clnt *clnt;
+        dprintk("lockd: destroy host %s\n", host->h_name);
        BUG_ON(!list_empty(&host->h_lockowners));
        BUG_ON(atomic_read(&host->h_count));
+        hlist_del_init(&host->h_hash);
        nsm_unmonitor(host);
        nsm_release(host->h_nsmhandle);
@@ -215,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
        if (clnt != NULL)
                rpc_shutdown_client(clnt);
        kfree(host);
+        nrhosts--;
 }
 /**
@@ -238,9 +208,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
                                     const char *hostname,
                                     int noresvport)
 {
-        const struct sockaddr source = {
-                .sa_family      = AF_UNSPEC,
-        };
        struct nlm_lookup_host_info ni = {
                .server         = 0,
                .sap            = sap,
@@ -249,16 +216,78 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
                .version        = version,
                .hostname       = hostname,
                .hostname_len   = strlen(hostname),
-                .src_sap        = &source,
-                .src_len        = sizeof(source),
                .noresvport     = noresvport,
        };
+        struct hlist_head *chain;
+        struct hlist_node *pos;
+        struct nlm_host *host;
+        struct nsm_handle *nsm = NULL;
        dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
                        (hostname ? hostname : "<none>"), version,
                        (protocol == IPPROTO_UDP ? "udp" : "tcp"));
-        return nlm_lookup_host(&ni);
+        mutex_lock(&nlm_host_mutex);
+        chain = &nlm_client_hosts[nlm_hash_address(sap)];
+        hlist_for_each_entry(host, pos, chain, h_hash) {
+                if (!rpc_cmp_addr(nlm_addr(host), sap))
+                        continue;
+                /* Same address. Share an NSM handle if we already have one */
+                if (nsm == NULL)
+                        nsm = host->h_nsmhandle;
+                if (host->h_proto != protocol)
+                        continue;
+                if (host->h_version != version)
+                        continue;
+                nlm_get_host(host);
+                dprintk("lockd: %s found host %s (%s)\n", __func__,
+                        host->h_name, host->h_addrbuf);
+                goto out;
+        }
+        host = nlm_alloc_host(&ni, nsm);
+        if (unlikely(host == NULL))
+                goto out;
+        hlist_add_head(&host->h_hash, chain);
+        nrhosts++;
+        dprintk("lockd: %s created host %s (%s)\n", __func__,
+                host->h_name, host->h_addrbuf);
+out:
+        mutex_unlock(&nlm_host_mutex);
+        return host;
+}
+/**
+ * nlmclnt_release_host - release client nlm_host
+ * @host: nlm_host to release
+ *
+ */
+void nlmclnt_release_host(struct nlm_host *host)
+{
+        if (host == NULL)
+                return;
+        dprintk("lockd: release client host %s\n", host->h_name);
+        BUG_ON(atomic_read(&host->h_count) < 0);
+        BUG_ON(host->h_server);
+        if (atomic_dec_and_test(&host->h_count)) {
+                BUG_ON(!list_empty(&host->h_lockowners));
+                BUG_ON(!list_empty(&host->h_granted));
+                BUG_ON(!list_empty(&host->h_reclaim));
+                mutex_lock(&nlm_host_mutex);
+                nlm_destroy_host_locked(host);
+                mutex_unlock(&nlm_host_mutex);
+        }
 }
 /**
@@ -283,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
                                    const char *hostname,
                                    const size_t hostname_len)
 {
+        struct hlist_head *chain;
+        struct hlist_node *pos;
+        struct nlm_host *host = NULL;
+        struct nsm_handle *nsm = NULL;
        struct sockaddr_in sin = {
                .sin_family     = AF_INET,
        };
        struct sockaddr_in6 sin6 = {
                .sin6_family    = AF_INET6,
        };
+        struct sockaddr *src_sap;
+        size_t src_len = rqstp->rq_addrlen;
        struct nlm_lookup_host_info ni = {
                .server         = 1,
                .sap            = svc_addr(rqstp),
@@ -297,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
                .version        = rqstp->rq_vers,
                .hostname       = hostname,
                .hostname_len   = hostname_len,
-                .src_len        = rqstp->rq_addrlen,
        };
        dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
                        (int)hostname_len, hostname, rqstp->rq_vers,
                        (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+        mutex_lock(&nlm_host_mutex);
        switch (ni.sap->sa_family) {
        case AF_INET:
                sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-                ni.src_sap = (struct sockaddr *)&sin;
+                src_sap = (struct sockaddr *)&sin;
                break;
        case AF_INET6:
                ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-                ni.src_sap = (struct sockaddr *)&sin6;
+                src_sap = (struct sockaddr *)&sin6;
                break;
        default:
-                return NULL;
+                dprintk("lockd: %s failed; unrecognized address family\n",
+                        __func__);
+                goto out;
        }
-        return nlm_lookup_host(&ni);
+        if (time_after_eq(jiffies, next_gc))
+                nlm_gc_hosts();
+        chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
+        hlist_for_each_entry(host, pos, chain, h_hash) {
+                if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
+                        continue;
+                /* Same address. Share an NSM handle if we already have one */
+                if (nsm == NULL)
+                        nsm = host->h_nsmhandle;
+                if (host->h_proto != ni.protocol)
+                        continue;
+                if (host->h_version != ni.version)
+                        continue;
+                if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
+                        continue;
+                /* Move to head of hash chain. */
+                hlist_del(&host->h_hash);
+                hlist_add_head(&host->h_hash, chain);
+                nlm_get_host(host);
+                dprintk("lockd: %s found host %s (%s)\n",
+                        __func__, host->h_name, host->h_addrbuf);
+                goto out;
+        }
+        host = nlm_alloc_host(&ni, nsm);
+        if (unlikely(host == NULL))
+                goto out;
+        memcpy(nlm_srcaddr(host), src_sap, src_len);
+        host->h_srcaddrlen = src_len;
+        hlist_add_head(&host->h_hash, chain);
+        nrhosts++;
+        dprintk("lockd: %s created host %s (%s)\n",
+                __func__, host->h_name, host->h_addrbuf);
+out:
+        mutex_unlock(&nlm_host_mutex);
+        return host;
+}
+/**
+ * nlmsvc_release_host - release server nlm_host
+ * @host: nlm_host to release
+ *
+ * Host is destroyed later in nlm_gc_host().
+ */
+void nlmsvc_release_host(struct nlm_host *host)
+{
+        if (host == NULL)
+                return;
+        dprintk("lockd: release server host %s\n", host->h_name);
+        BUG_ON(atomic_read(&host->h_count) < 0);
+        BUG_ON(!host->h_server);
+        atomic_dec(&host->h_count);
 }
 /*
@@ -357,7 +456,6 @@ nlm_bind_host(struct nlm_host *host)
                        .protocol       = host->h_proto,
                        .address        = nlm_addr(host),
                        .addrsize       = host->h_addrlen,
-                        .saddress       = nlm_srcaddr(host),
                        .timeout        = &timeparms,
                        .servername     = host->h_name,
                        .program        = &nlm_program,
@@ -376,6 +474,8 @@ nlm_bind_host(struct nlm_host *host)
                        args.flags |= RPC_CLNT_CREATE_HARDRTRY;
                if (host->h_noresvport)
                        args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+                if (host->h_srcaddrlen)
+                        args.saddress = nlm_srcaddr(host);
                clnt = rpc_create(&args);
                if (!IS_ERR(clnt))
@@ -416,20 +516,29 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
        return host;
 }
-/*
+static struct nlm_host *next_host_state(struct hlist_head *cache,
- * Release NLM host after use
+                                        struct nsm_handle *nsm,
- */
+                                        const struct nlm_reboot *info)
-void nlm_release_host(struct nlm_host *host)
 {
-        if (host != NULL) {
+        struct nlm_host *host;
-                dprintk("lockd: release host %s\n", host->h_name);
+        struct hlist_head *chain;
-                BUG_ON(atomic_read(&host->h_count) < 0);
+        struct hlist_node *pos;
-                if (atomic_dec_and_test(&host->h_count)) {
-                        BUG_ON(!list_empty(&host->h_lockowners));
+        mutex_lock(&nlm_host_mutex);
-                        BUG_ON(!list_empty(&host->h_granted));
+        for_each_host(host, pos, chain, cache) {
-                        BUG_ON(!list_empty(&host->h_reclaim));
+                if (host->h_nsmhandle == nsm
+                    && host->h_nsmstate != info->state) {
+                        host->h_nsmstate = info->state;
+                        host->h_state++;
+                        nlm_get_host(host);
+                        mutex_unlock(&nlm_host_mutex);
+                        return host;
                }
        }
+        mutex_unlock(&nlm_host_mutex);
+        return NULL;
 }
 /**
@@ -441,8 +550,6 @@ void nlm_release_host(struct nlm_host *host)
 */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
-        struct hlist_head *chain;
-        struct hlist_node *pos;
        struct nsm_handle *nsm;
        struct nlm_host *host;
@@ -455,32 +562,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
         * lock for this.
         * To avoid processing a host several times, we match the nsmstate.
         */
-again:  mutex_lock(&nlm_host_mutex);
+        while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+                nlmsvc_free_host_resources(host);
-                hlist_for_each_entry(host, pos, chain, h_hash) {
+                nlmsvc_release_host(host);
-                        if (host->h_nsmhandle == nsm
-                         && host->h_nsmstate != info->state) {
-                                host->h_nsmstate = info->state;
-                                host->h_state++;
-                                nlm_get_host(host);
-                                mutex_unlock(&nlm_host_mutex);
-                                if (host->h_server) {
-                                        /* We're server for this guy, just ditch
-                                         * all the locks he held. */
-                                        nlmsvc_free_host_resources(host);
-                                } else {
-                                        /* He's the server, initiate lock recovery. */
-                                        nlmclnt_recovery(host);
-                                }
-                                nlm_release_host(host);
-                                goto again;
-                        }
-                }
        }
-        mutex_unlock(&nlm_host_mutex);
+        while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
+                nlmclnt_recovery(host);
+                nlmclnt_release_host(host);
+        }
        nsm_release(nsm);
 }
@@ -500,13 +590,11 @@ nlm_shutdown_hosts(void)
        /* First, make all hosts eligible for gc */
        dprintk("lockd: nuking all hosts...\n");
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host(host, pos, chain, nlm_server_hosts) {
-                hlist_for_each_entry(host, pos, chain, h_hash) {
+                host->h_expires = jiffies - 1;
-                        host->h_expires = jiffies - 1;
+                if (host->h_rpcclnt) {
-                        if (host->h_rpcclnt) {
+                        rpc_shutdown_client(host->h_rpcclnt);
-                                rpc_shutdown_client(host->h_rpcclnt);
+                        host->h_rpcclnt = NULL;
-                                host->h_rpcclnt = NULL;
-                        }
                }
        }
@@ -515,15 +603,13 @@ nlm_shutdown_hosts(void)
        mutex_unlock(&nlm_host_mutex);
        /* complain if any hosts are left */
-        if (nrhosts) {
+        if (nrhosts != 0) {
                printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
-                dprintk("lockd: %d hosts left:\n", nrhosts);
+                dprintk("lockd: %lu hosts left:\n", nrhosts);
-                for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+                for_each_host(host, pos, chain, nlm_server_hosts) {
-                        hlist_for_each_entry(host, pos, chain, h_hash) {
+                        dprintk("       %s (cnt %d use %d exp %ld)\n",
-                                dprintk("       %s (cnt %d use %d exp %ld)\n",
+                                host->h_name, atomic_read(&host->h_count),
-                                        host->h_name, atomic_read(&host->h_count),
+                                host->h_inuse, host->h_expires);
-                                        host->h_inuse, host->h_expires);
-                        }
                }
        }
 }
@@ -541,29 +627,22 @@ nlm_gc_hosts(void)
        struct nlm_host *host;
        dprintk("lockd: host garbage collection\n");
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host(host, pos, chain, nlm_server_hosts)
-                hlist_for_each_entry(host, pos, chain, h_hash)
+                host->h_inuse = 0;
-                        host->h_inuse = 0;
-        }
        /* Mark all hosts that hold locks, blocks or shares */
        nlmsvc_mark_resources();
-        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+        for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
-                hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
+                if (atomic_read(&host->h_count) || host->h_inuse
-                        if (atomic_read(&host->h_count) || host->h_inuse
+                 || time_before(jiffies, host->h_expires)) {
-                         || time_before(jiffies, host->h_expires)) {
+                        dprintk("nlm_gc_hosts skipping %s "
-                                dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
+                                "(cnt %d use %d exp %ld)\n",
-                                        host->h_name, atomic_read(&host->h_count),
+                                host->h_name, atomic_read(&host->h_count),
-                                        host->h_inuse, host->h_expires);
+                                host->h_inuse, host->h_expires);
-                                continue;
+                        continue;
-                        }
-                        dprintk("lockd: delete host %s\n", host->h_name);
-                        hlist_del_init(&host->h_hash);
-                        nlm_destroy_host(host);
-                        nrhosts--;
                }
+                nlm_destroy_host_locked(host);
        }
        next_gc = jiffies + NLM_HOST_COLLECT;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c91894964..23d7451b293 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
 * Status Monitor wire protocol.
 */
-static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
+static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
        const u32 len = strlen(string);
        __be32 *p;
-        if (unlikely(len > SM_MAXSTRLEN))
+        BUG_ON(len > SM_MAXSTRLEN);
-                return -EIO;
+        p = xdr_reserve_space(xdr, 4 + len);
-        p = xdr_reserve_space(xdr, sizeof(u32) + len);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque(p, string, len);
-        return 0;
 }
 /*
 * "mon_name" specifies the host to be monitored.
 */
-static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        return encode_nsm_string(xdr, argp->mon_name);
+        encode_nsm_string(xdr, argp->mon_name);
 }
 /*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
 * has changed.
 */
-static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        int status;
        __be32 *p;
-        status = encode_nsm_string(xdr, utsname()->nodename);
+        encode_nsm_string(xdr, utsname()->nodename);
-        if (unlikely(status != 0))
+        p = xdr_reserve_space(xdr, 4 + 4 + 4);
-                return status;
+        *p++ = cpu_to_be32(argp->prog);
-        p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+        *p++ = cpu_to_be32(argp->vers);
-        if (unlikely(p == NULL))
+        *p = cpu_to_be32(argp->proc);
-                return -EIO;
-        *p++ = htonl(argp->prog);
-        *p++ = htonl(argp->vers);
-        *p++ = htonl(argp->proc);
-        return 0;
 }
 /*
 * The "mon_id" argument specifies the non-private arguments
 * of an NSMPROC_MON or NSMPROC_UNMON call.
 */
-static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        int status;
+        encode_mon_name(xdr, argp);
+        encode_my_id(xdr, argp);
-        status = encode_mon_name(xdr, argp);
-        if (unlikely(status != 0))
-                return status;
-        return encode_my_id(xdr, argp);
 }
 /*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 * by the NSMPROC_MON call. This information will be supplied in the
 * NLMPROC_SM_NOTIFY call.
 */
-static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
        __be32 *p;
        p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
-        return 0;
 }
-static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
+static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
-                       const struct nsm_args *argp)
+                            const struct nsm_args *argp)
 {
-        struct xdr_stream xdr;
+        encode_mon_id(xdr, argp);
-        int status;
+        encode_priv(xdr, argp);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        status = encode_mon_id(&xdr, argp);
-        if (unlikely(status))
-                return status;
-        return encode_priv(&xdr, argp);
 }
-static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
+static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
-                         const struct nsm_args *argp)
+                              const struct nsm_args *argp)
 {
-        struct xdr_stream xdr;
+        encode_mon_id(xdr, argp);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        return encode_mon_id(&xdr, argp);
 }
-static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
+static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
-                            struct nsm_res *resp)
+                                struct xdr_stream *xdr,
+                                struct nsm_res *resp)
 {
-        struct xdr_stream xdr;
+        __be32 *p;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(xdr, 4 + 4);
-        p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
        if (unlikely(p == NULL))
                return -EIO;
-        resp->status = ntohl(*p++);
+        resp->status = be32_to_cpup(p++);
-        resp->state = ntohl(*p);
+        resp->state = be32_to_cpup(p);
-        dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
+        dprintk("lockd: %s status %d state %d\n",
-                        resp->status, resp->state);
+                __func__, resp->status, resp->state);
        return 0;
 }
-static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
+static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
-                        struct nsm_res *resp)
+                            struct xdr_stream *xdr,
+                            struct nsm_res *resp)
 {
-        struct xdr_stream xdr;
+        __be32 *p;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(xdr, 4);
-        p = xdr_inline_decode(&xdr, sizeof(u32));
        if (unlikely(p == NULL))
                return -EIO;
-        resp->state = ntohl(*p);
+        resp->state = be32_to_cpup(p);
-        dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
+        dprintk("lockd: %s state %d\n", __func__, resp->state);
        return 0;
 }
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
 static struct rpc_procinfo      nsm_procedures[] = {
 [NSMPROC_MON] = {
                .p_proc         = NSMPROC_MON,
-                .p_encode       = (kxdrproc_t)xdr_enc_mon,
+                .p_encode       = (kxdreproc_t)nsm_xdr_enc_mon,
-                .p_decode       = (kxdrproc_t)xdr_dec_stat_res,
+                .p_decode       = (kxdrdproc_t)nsm_xdr_dec_stat_res,
                .p_arglen       = SM_mon_sz,
                .p_replen       = SM_monres_sz,
                .p_statidx      = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo	nsm_procedures[] = {
        },
 [NSMPROC_UNMON] = {
                .p_proc         = NSMPROC_UNMON,
-                .p_encode       = (kxdrproc_t)xdr_enc_unmon,
+                .p_encode       = (kxdreproc_t)nsm_xdr_enc_unmon,
-                .p_decode       = (kxdrproc_t)xdr_dec_stat,
+                .p_decode       = (kxdrdproc_t)nsm_xdr_dec_stat,
                .p_arglen       = SM_mon_id_sz,
                .p_replen       = SM_unmonres_sz,
                .p_statidx      = NSMPROC_UNMON,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475..9a41fdc1951 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
@@ -52,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 no_locks:
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        if (error)
                return error;   
        return nlm_lck_denied_nolocks;
@@ -93,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -135,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -165,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
        dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -198,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_unlock(file, &argp->lock);
        dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -230,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 static void nlm4svc_callback_release(void *data)
 {
-        nlm_release_call(data);
+        nlmsvc_release_call(data);
 }
 static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -262,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
        stat = func(rqstp, argp, &call->a_res);
        if (stat != 0) {
-                nlm_release_call(call);
+                nlmsvc_release_call(call);
                return stat;
        }
@@ -335,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_share_file(host, file, argp);
        dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -368,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = nlmsvc_unshare_file(host, file, argp);
        dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -400,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
                return rpc_success;
        nlmsvc_free_host_resources(host);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        return rpc_success;
 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acb..6e31695d046 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/nlm.h>
@@ -47,6 +46,7 @@ static void	nlmsvc_remove_block(struct nlm_block *block);
 static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
 static void nlmsvc_freegrantargs(struct nlm_rqst *call);
 static const struct rpc_call_ops nlmsvc_grant_ops;
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
 /*
 * The list of blocked locks to retry
@@ -234,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 failed_free:
        kfree(block);
 failed:
-        nlm_release_call(call);
+        nlmsvc_release_call(call);
        return NULL;
 }
@@ -267,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
        mutex_unlock(&file->f_mutex);
        nlmsvc_freegrantargs(block->b_call);
-        nlm_release_call(block->b_call);
+        nlmsvc_release_call(block->b_call);
        nlm_release_file(block->b_file);
        kfree(block->b_fl);
        kfree(block);
@@ -935,3 +935,32 @@ nlmsvc_retry_blocked(void)
        return timeout;
 }
+#ifdef RPC_DEBUG
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
+{
+        /*
+         * We can get away with a static buffer because we're only
+         * called with BKL held.
+         */
+        static char buf[2*NLM_MAXCOOKIELEN+1];
+        unsigned int i, len = sizeof(buf);
+        char *p = buf;
+        len--;  /* allow for trailing \0 */
+        if (len < 3)
+                return "???";
+        for (i = 0 ; i < cookie->len ; i++) {
+                if (len < 2) {
+                        strcpy(p-3, "...");
+                        break;
+                }
+                sprintf(p, "%02x", cookie->data[i]);
+                p += 2;
+                len -= 2;
+        }
+        *p = '\0';
+        return buf;
+}
+#endif
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d60..d27aab11f32 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
@@ -81,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
        return 0;
 no_locks:
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        if (error)
                return error;
        return nlm_lck_denied_nolocks;
@@ -123,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
                dprintk("lockd: TEST          status %d vers %d\n",
                        ntohl(resp->status), rqstp->rq_vers);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -165,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
        else
                dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rc;
 }
@@ -195,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
        dprintk("lockd: CANCEL        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -228,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
        dprintk("lockd: UNLOCK        status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -258,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
                        -task->tk_status);
 }
+void nlmsvc_release_call(struct nlm_rqst *call)
+{
+        if (!atomic_dec_and_test(&call->a_count))
+                return;
+        nlmsvc_release_host(call->a_host);
+        kfree(call);
+}
 static void nlmsvc_callback_release(void *data)
 {
-        nlm_release_call(data);
+        nlmsvc_release_call(data);
 }
 static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -292,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
        stat = func(rqstp, argp, &call->a_res);
        if (stat != 0) {
-                nlm_release_call(call);
+                nlmsvc_release_call(call);
                return stat;
        }
@@ -367,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_share_file(host, file, argp));
        dprintk("lockd: SHARE         status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -400,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
        resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
        dprintk("lockd: UNSHARE       status %d\n", ntohl(resp->status));
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        nlm_release_file(file);
        return rpc_success;
 }
@@ -432,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
                return rpc_success;
        nlmsvc_free_host_resources(host);
-        nlm_release_host(host);
+        nlmsvc_release_host(host);
        return rpc_success;
 }
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cb..964666c68a8 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-        struct file_lock        *fl = &lock->fl;
-        __s32                   start, len;
-        if (!(p = xdr_encode_string(p, lock->caller))
-         || !(p = nlm_encode_fh(p, &lock->fh))
-         || !(p = nlm_encode_oh(p, &lock->oh)))
-                return NULL;
-        if (fl->fl_start > NLM_OFFSET_MAX
-         || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-                return NULL;
-        start = loff_t_to_s32(fl->fl_start);
-        if (fl->fl_end == OFFSET_MAX)
-                len = 0;
-        else
-                len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
-        *p++ = htonl(lock->svid);
-        *p++ = htonl(start);
-        *p++ = htonl(len);
-        return p;
-}
-/*
 * Encode result of a TEST/TEST_MSG call
 */
 static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
        return xdr_ressize_check(rqstp, p);
 }
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
-{
-        return 0;
-}
-#endif
-static int
-nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        if (resp->status == nlm_lck_denied) {
-                struct file_lock        *fl = &resp->lock.fl;
-                u32                     excl;
-                s32                     start, len, end;
-                memset(&resp->lock, 0, sizeof(resp->lock));
-                locks_init_lock(fl);
-                excl = ntohl(*p++);
-                resp->lock.svid = ntohl(*p++);
-                fl->fl_pid = (pid_t)resp->lock.svid;
-                if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
-                        return -EIO;
-                fl->fl_flags = FL_POSIX;
-                fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-                start = ntohl(*p++);
-                len = ntohl(*p++);
-                end = start + len - 1;
-                fl->fl_start = s32_to_loff_t(start);
-                if (len == 0 || end < 0)
-                        fl->fl_end = OFFSET_MAX;
-                else
-                        fl->fl_end = s32_to_loff_t(end);
-        }
-        return 0;
-}
-static int
-nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        *p++ = argp->reclaim? xdr_one : xdr_zero;
-        *p++ = htonl(argp->state);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        if (!(p = nlm_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_encode_cookie(p, &resp->cookie)))
-                return -EIO;
-        *p++ = resp->status;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_encode_testres(p, resp)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        return 0;
-}
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-/*
- * Buffer requirements for NLM
- */
-#define NLM_void_sz             0
-#define NLM_cookie_sz           1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz           1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_owner_sz            1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_fhandle_sz          1+XDR_QUADLEN(NFS2_FHSIZE)
-#define NLM_lock_sz             3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
-#define NLM_holder_sz           4+NLM_owner_sz
-#define NLM_testargs_sz         NLM_cookie_sz+1+NLM_lock_sz
-#define NLM_lockargs_sz         NLM_cookie_sz+4+NLM_lock_sz
-#define NLM_cancargs_sz         NLM_cookie_sz+2+NLM_lock_sz
-#define NLM_unlockargs_sz       NLM_cookie_sz+NLM_lock_sz
-#define NLM_testres_sz          NLM_cookie_sz+1+NLM_holder_sz
-#define NLM_res_sz              NLM_cookie_sz+1
-#define NLM_norep_sz            0
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlmclt_decode_norep     NULL
-#define PROC(proc, argtype, restype)    \
-[NLMPROC_##proc] = {                                                    \
-        .p_proc      = NLMPROC_##proc,                                  \
-        .p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,            \
-        .p_decode    = (kxdrproc_t) nlmclt_decode_##restype,            \
-        .p_arglen    = NLM_##argtype##_sz,                              \
-        .p_replen    = NLM_##restype##_sz,                              \
-        .p_statidx   = NLMPROC_##proc,                                  \
-        .p_name      = #proc,                                           \
-        }
-static struct rpc_procinfo      nlm_procedures[] = {
-    PROC(TEST,          testargs,       testres),
-    PROC(LOCK,          lockargs,       res),
-    PROC(CANCEL,        cancargs,       res),
-    PROC(UNLOCK,        unlockargs,     res),
-    PROC(GRANTED,       testargs,       res),
-    PROC(TEST_MSG,      testargs,       norep),
-    PROC(LOCK_MSG,      lockargs,       norep),
-    PROC(CANCEL_MSG,    cancargs,       norep),
-    PROC(UNLOCK_MSG,    unlockargs,     norep),
-    PROC(GRANTED_MSG,   testargs,       norep),
-    PROC(TEST_RES,      testres,        norep),
-    PROC(LOCK_RES,      res,            norep),
-    PROC(CANCEL_RES,    res,            norep),
-    PROC(UNLOCK_RES,    res,            norep),
-    PROC(GRANTED_RES,   res,            norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-    PROC(SHARE,         shareargs,      shareres),
-    PROC(UNSHARE,       shareargs,      shareres),
-    PROC(NM_LOCK,       lockargs,       res),
-    PROC(FREE_ALL,      notify,         void),
-#endif
-};
-static struct rpc_version       nlm_version1 = {
-                .number         = 1,
-                .nrprocs        = 16,
-                .procs          = nlm_procedures,
-};
-static struct rpc_version       nlm_version3 = {
-                .number         = 3,
-                .nrprocs        = 24,
-                .procs          = nlm_procedures,
-};
-static struct rpc_version *     nlm_versions[] = {
-        [1] = &nlm_version1,
-        [3] = &nlm_version3,
-#ifdef  CONFIG_LOCKD_V4
-        [4] = &nlm_version4,
-#endif
-};
-static struct rpc_stat          nlm_stats;
-struct rpc_program              nlm_program = {
-                .name           = "lockd",
-                .number         = NLM_PROGRAM,
-                .nrvers         = ARRAY_SIZE(nlm_versions),
-                .version        = nlm_versions,
-                .stats          = &nlm_stats,
-};
-#ifdef RPC_DEBUG
-const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
-{
-        /*
-         * We can get away with a static buffer because we're only
-         * called with BKL held.
-         */
-        static char buf[2*NLM_MAXCOOKIELEN+1];
-        unsigned int i, len = sizeof(buf);
-        char *p = buf;
-        len--;  /* allow for trailing \0 */
-        if (len < 3)
-                return "???";
-        for (i = 0 ; i < cookie->len ; i++) {
-                if (len < 2) {
-                        strcpy(p-3, "...");
-                        break;
-                }
-                sprintf(p, "%02x", cookie->data[i]);
-                p += 2;
-                len -= 2;
-        }
-        *p = '\0';
-        return buf;
-}
-#endif
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145..dfa4789cd46 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
        return p + XDR_QUADLEN(f->size);
 }
-static __be32 *
-nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
-{
-        *p++ = htonl(f->size);
-        if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
-        memcpy(p, f->data, f->size);
-        return p + XDR_QUADLEN(f->size);
-}
 /*
 * Encode and decode owner handle
 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
 }
 static __be32 *
-nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
-{
-        return xdr_encode_netobj(p, oh);
-}
-static __be32 *
 nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
        struct file_lock        *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-        struct file_lock        *fl = &lock->fl;
-        __s64                   start, len;
-        if (!(p = xdr_encode_string(p, lock->caller))
-         || !(p = nlm4_encode_fh(p, &lock->fh))
-         || !(p = nlm4_encode_oh(p, &lock->oh)))
-                return NULL;
-        if (fl->fl_start > NLM4_OFFSET_MAX
-         || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-                return NULL;
-        *p++ = htonl(lock->svid);
-        start = loff_t_to_s64(fl->fl_start);
-        if (fl->fl_end == OFFSET_MAX)
-                len = 0;
-        else
-                len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
-        p = xdr_encode_hyper(p, start);
-        p = xdr_encode_hyper(p, len);
-        return p;
-}
-/*
 * Encode result of a TEST/TEST_MSG call
 */
 static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
        return xdr_ressize_check(rqstp, p);
 }
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
-{
-        return 0;
-}
-#endif
-static int
-nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        if (resp->status == nlm_lck_denied) {
-                struct file_lock        *fl = &resp->lock.fl;
-                u32                     excl;
-                __u64                   start, len;
-                __s64                   end;
-                memset(&resp->lock, 0, sizeof(resp->lock));
-                locks_init_lock(fl);
-                excl = ntohl(*p++);
-                resp->lock.svid = ntohl(*p++);
-                fl->fl_pid = (pid_t)resp->lock.svid;
-                if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
-                        return -EIO;
-                fl->fl_flags = FL_POSIX;
-                fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-                p = xdr_decode_hyper(p, &start);
-                p = xdr_decode_hyper(p, &len);
-                end = start + len - 1;
-                fl->fl_start = s64_to_loff_t(start);
-                if (len == 0 || end < 0)
-                        fl->fl_end = OFFSET_MAX;
-                else
-                        fl->fl_end = s64_to_loff_t(end);
-        }
-        return 0;
-}
-static int
-nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        *p++ = argp->reclaim? xdr_one : xdr_zero;
-        *p++ = htonl(argp->state);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        *p++ = argp->block? xdr_one : xdr_zero;
-        *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-        struct nlm_lock *lock = &argp->lock;
-        if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-                return -EIO;
-        if (!(p = nlm4_encode_lock(p, lock)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
-                return -EIO;
-        *p++ = resp->status;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_encode_testres(p, resp)))
-                return -EIO;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
-}
-static int
-nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-        if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-                return -EIO;
-        resp->status = *p++;
-        return 0;
-}
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
-#  error "NLM host name cannot be larger than NLM's maximum string length!"
-#endif
-/*
- * Buffer requirements for NLM
- */
-#define NLM4_void_sz            0
-#define NLM4_cookie_sz          1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM4_caller_sz          1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_owner_sz           1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_fhandle_sz         1+XDR_QUADLEN(NFS3_FHSIZE)
-#define NLM4_lock_sz            5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
-#define NLM4_holder_sz          6+NLM4_owner_sz
-#define NLM4_testargs_sz        NLM4_cookie_sz+1+NLM4_lock_sz
-#define NLM4_lockargs_sz        NLM4_cookie_sz+4+NLM4_lock_sz
-#define NLM4_cancargs_sz        NLM4_cookie_sz+2+NLM4_lock_sz
-#define NLM4_unlockargs_sz      NLM4_cookie_sz+NLM4_lock_sz
-#define NLM4_testres_sz         NLM4_cookie_sz+1+NLM4_holder_sz
-#define NLM4_res_sz             NLM4_cookie_sz+1
-#define NLM4_norep_sz           0
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlm4clt_decode_norep    NULL
-#define PROC(proc, argtype, restype)                                    \
-[NLMPROC_##proc] = {                                                    \
-        .p_proc      = NLMPROC_##proc,                                  \
-        .p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,           \
-        .p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,           \
-        .p_arglen    = NLM4_##argtype##_sz,                             \
-        .p_replen    = NLM4_##restype##_sz,                             \
-        .p_statidx   = NLMPROC_##proc,                                  \
-        .p_name      = #proc,                                           \
-        }
-static struct rpc_procinfo      nlm4_procedures[] = {
-    PROC(TEST,          testargs,       testres),
-    PROC(LOCK,          lockargs,       res),
-    PROC(CANCEL,        cancargs,       res),
-    PROC(UNLOCK,        unlockargs,     res),
-    PROC(GRANTED,       testargs,       res),
-    PROC(TEST_MSG,      testargs,       norep),
-    PROC(LOCK_MSG,      lockargs,       norep),
-    PROC(CANCEL_MSG,    cancargs,       norep),
-    PROC(UNLOCK_MSG,    unlockargs,     norep),
-    PROC(GRANTED_MSG,   testargs,       norep),
-    PROC(TEST_RES,      testres,        norep),
-    PROC(LOCK_RES,      res,            norep),
-    PROC(CANCEL_RES,    res,            norep),
-    PROC(UNLOCK_RES,    res,            norep),
-    PROC(GRANTED_RES,   res,            norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-    PROC(SHARE,         shareargs,      shareres),
-    PROC(UNSHARE,       shareargs,      shareres),
-    PROC(NM_LOCK,       lockargs,       res),
-    PROC(FREE_ALL,      notify,         void),
-#endif
-};
-struct rpc_version      nlm_version4 = {
-        .number         = 4,
-        .nrprocs        = 24,
-        .procs          = nlm4_procedures,
-};
diff --git a/fs/locks.c b/fs/locks.c
index 65765cb6afe..0f3998291f7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
 #include <linux/module.h>
 #include <linux/security.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/time.h>
 #include <linux/rcupdate.h>
@@ -445,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
        fl->fl_file->f_owner.signum = 0;
 }
-static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
-{
-        return fl->fl_file == try->fl_file;
-}
 static const struct lock_manager_operations lease_manager_ops = {
        .fl_break = lease_break_callback,
        .fl_release_private = lease_release_private_callback,
-        .fl_mylease = lease_mylease_callback,
        .fl_change = lease_modify,
 };
@@ -1390,7 +1383,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
                if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                        goto out;
                if ((arg == F_WRLCK)
-                    && ((atomic_read(&dentry->d_count) > 1)
+                    && ((dentry->d_count > 1)
                        || (atomic_read(&inode->i_count) > 1)))
                        goto out;
        }
@@ -1406,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
        for (before = &inode->i_flock;
                        ((fl = *before) != NULL) && IS_LEASE(fl);
                        before = &fl->fl_next) {
-                if (lease->fl_lmops->fl_mylease(fl, lease))
+                if (fl->fl_file == filp)
                        my_before = before;
                else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
                        /*
@@ -1504,9 +1497,8 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
-        struct file_lock *fl;
+        struct file_lock *fl, *ret;
        struct fasync_struct *new;
-        struct inode *inode = filp->f_path.dentry->d_inode;
        int error;
        fl = lease_alloc(filp, arg);
@@ -1518,13 +1510,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
                locks_free_lock(fl);
                return -ENOMEM;
        }
+        ret = fl;
        lock_flocks();
-        error = __vfs_setlease(filp, arg, &fl);
+        error = __vfs_setlease(filp, arg, &ret);
        if (error) {
                unlock_flocks();
                locks_free_lock(fl);
                goto out_free_fasync;
        }
+        if (ret != fl)
+                locks_free_lock(fl);
        /*
         * fasync_insert_entry() returns the old entry if any.
@@ -1532,17 +1527,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
         * inserted it into the fasync list. Clear new so that
         * we don't release it here.
         */
-        if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+        if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
                new = NULL;
-        if (error < 0) {
+        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-                /* remove lease just inserted by setlease */
-                fl->fl_type = F_UNLCK | F_INPROGRESS;
-                fl->fl_break_time = jiffies - 10;
-                time_out_leases(inode);
-        } else {
-                error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-        }
        unlock_flocks();
 out_free_fasync:
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09b..723bc5bca09 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 static void bdev_put_device(struct logfs_super *s)
 {
-        close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
+        blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
 {
        struct block_device *bdev;
-        bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+        bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+                                  type);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
                int mtdnr = MINOR(bdev->bd_dev);
-                close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+                blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
                return logfs_get_sb_mtd(p, mtdnr);
        }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 409dfd65e9a..f9ddf0c388c 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
        return __logfs_create(dir, dentry, inode, target, destlen);
 }
-static int logfs_permission(struct inode *inode, int mask)
+static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        return generic_permission(inode, mask, NULL);
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        return generic_permission(inode, mask, flags, NULL);
 }
 static int logfs_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098..03b8c240aed 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
        return __logfs_iget(sb, ino);
 }
+static void logfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
 static void __logfs_destroy_inode(struct inode *inode)
 {
        struct logfs_inode *li = logfs_inode(inode);
        BUG_ON(li->li_block);
        list_del(&li->li_freeing_list);
-        kmem_cache_free(logfs_inode_cache, li);
+        call_rcu(&inode->i_rcu, logfs_i_callback);
 }
 static void logfs_destroy_inode(struct inode *inode)
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e13..9da29706f91 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
                super->s_journal_seg[i] = segno;
                super->s_journal_ec[i] = ec;
                logfs_set_segment_reserved(sb, segno);
-                err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+                err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
                BUG_ON(err); /* mempool should prevent this */
                err = logfs_erase_segment(sb, segno, 1);
                BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index cd51a36b37f..57afd4a6fab 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -486,7 +486,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
 #else
 static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e18..ee99a9f5dfd 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
        /* FIXME: transaction is part of logfs_block now.  Is that enough? */
        err = logfs_write_buf(master_inode, page, 0);
+        if (err)
+                move_page_to_inode(inode, page);
        logfs_put_write_page(page);
        return err;
 }
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237..a25444ab2ba 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 #endif
-struct mb_cache {
-        struct list_head                c_cache_list;
-        const char                      *c_name;
-        atomic_t                        c_entry_count;
-        int                             c_max_entries;
-        int                             c_bucket_bits;
-        struct kmem_cache               *c_entry_cache;
-        struct list_head                *c_block_hash;
-        struct list_head                *c_index_hash;
-};
 /*
 * Global data: list of all mbcache's, lru list, and a spinlock for
 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fb2020858a3..ae0b83f476a 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void minix_destroy_inode(struct inode *inode)
+static void minix_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
+static void minix_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, minix_i_callback);
+}
 static void init_once(void *foo)
 {
        struct minix_inode_info *ei = (struct minix_inode_info *) foo;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index c0d35a3acce..ce7337ddfdb 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
        struct inode * inode = NULL;
        ino_t ino;
-        dentry->d_op = dir->i_sb->s_root->d_op;
        if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
                return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea55..d78455a81ec 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
-static void mpage_end_io_read(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err)
 {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);
+                if (bio_data_dir(bio) == READ) {
-                if (uptodate) {
+                        if (uptodate) {
-                        SetPageUptodate(page);
+                                SetPageUptodate(page);
-                } else {
+                        } else {
-                        ClearPageUptodate(page);
+                                ClearPageUptodate(page);
-                        SetPageError(page);
+                                SetPageError(page);
-                }
+                        }
-                unlock_page(page);
+                        unlock_page(page);
-        } while (bvec >= bio->bi_io_vec);
+                } else { /* bio_data_dir(bio) == WRITE */
-        bio_put(bio);
+                        if (!uptodate) {
-}
+                                SetPageError(page);
+                                if (page->mapping)
-static void mpage_end_io_write(struct bio *bio, int err)
+                                        set_bit(AS_EIO, &page->mapping->flags);
-{
+                        }
-        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+                        end_page_writeback(page);
-        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-        do {
-                struct page *page = bvec->bv_page;
-                if (--bvec >= bio->bi_io_vec)
-                        prefetchw(&bvec->bv_page->flags);
-                if (!uptodate){
-                        SetPageError(page);
-                        if (page->mapping)
-                                set_bit(AS_EIO, &page->mapping->flags);
                }
-                end_page_writeback(page);
        } while (bvec >= bio->bi_io_vec);
        bio_put(bio);
 }
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
-        bio->bi_end_io = mpage_end_io_read;
+        bio->bi_end_io = mpage_end_io;
-        if (rw == WRITE)
-                bio->bi_end_io = mpage_end_io_write;
        submit_bio(rw, bio);
        return NULL;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b737..7d77f24d32a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
 /*
 * This does basic POSIX ACL permission checking
 */
-static int acl_permission_check(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
-                int (*check_acl)(struct inode *inode, int mask))
+                int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
        umode_t                 mode = inode->i_mode;
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
                mode >>= 6;
        else {
                if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
-                        int error = check_acl(inode, mask);
+                        int error = check_acl(inode, mask, flags);
                        if (error != -EAGAIN)
                                return error;
                }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
 }
 /**
- * generic_permission  -  check for access rights on a Posix-like filesystem
+ * generic_permission -  check for access rights on a Posix-like filesystem
 * @inode:      inode to check access rights for
 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 * @check_acl:  optional callback to check for Posix ACLs
+ * @flags:      IPERM_FLAG_ flags.
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
- * are used for other things..
+ * are used for other things.
+ *
+ * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
+ * request cannot be satisfied (eg. requires blocking or too much complexity).
+ * It would then be called again in ref-walk mode.
 */
-int generic_permission(struct inode *inode, int mask,
+int generic_permission(struct inode *inode, int mask, unsigned int flags,
-                int (*check_acl)(struct inode *inode, int mask))
+        int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
        int ret;
        /*
         * Do the basic POSIX ACL permission checks.
         */
-        ret = acl_permission_check(inode, mask, check_acl);
+        ret = acl_permission_check(inode, mask, flags, check_acl);
        if (ret != -EACCES)
                return ret;
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
        }
        if (inode->i_op->permission)
-                retval = inode->i_op->permission(inode, mask);
+                retval = inode->i_op->permission(inode, mask, 0);
        else
-                retval = generic_permission(inode, mask, inode->i_op->check_acl);
+                retval = generic_permission(inode, mask, 0,
+                                inode->i_op->check_acl);
        if (retval)
                return retval;
@@ -375,6 +381,181 @@ void path_put(struct path *path)
 EXPORT_SYMBOL(path_put);
 /**
+ * nameidata_drop_rcu - drop this nameidata out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * Path walking has 2 modes, rcu-walk and ref-walk (see
+ * Documentation/filesystems/path-lookup.txt). The nameidata_*drop_rcu*
+ * functions attempt to drop out of rcu-walk mode and take normal reference
+ * counts on dentries and vfsmounts to transition to ref-walk mode. They take
+ * refcounts at the last known good point before rcu-walk got stuck, so
+ * ref-walk may continue from there. If this is not successful (eg. a seqcount
+ * has changed), then failure is returned and path walk restarts from the
+ * beginning in ref-walk mode.
+ *
+ * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
+ * ref-walk. Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu(struct nameidata *nd)
+{
+        struct fs_struct *fs = current->fs;
+        struct dentry *dentry = nd->path.dentry;
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        if (nd->root.mnt) {
+                spin_lock(&fs->lock);
+                if (nd->root.mnt != fs->root.mnt ||
+                                nd->root.dentry != fs->root.dentry)
+                        goto err_root;
+        }
+        spin_lock(&dentry->d_lock);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err;
+        BUG_ON(nd->inode != dentry->d_inode);
+        spin_unlock(&dentry->d_lock);
+        if (nd->root.mnt) {
+                path_get(&nd->root);
+                spin_unlock(&fs->lock);
+        }
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        nd->flags &= ~LOOKUP_RCU;
+        return 0;
+err:
+        spin_unlock(&dentry->d_lock);
+err_root:
+        if (nd->root.mnt)
+                spin_unlock(&fs->lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
+{
+        if (nd->flags & LOOKUP_RCU)
+                return nameidata_drop_rcu(nd);
+        return 0;
+}
+/**
+ * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * @dentry: dentry to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
+ * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
+ * @nd. Must be called from rcu-walk context.
+ */
+static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+{
+        struct fs_struct *fs = current->fs;
+        struct dentry *parent = nd->path.dentry;
+        /*
+         * We may be asked to revalidate the dentry that we started
+         * the path walk with. force_reval_path may also revalidate the
+         * dentry already committed to the nameidata.
+         */
+        if (unlikely(parent == dentry))
+                return nameidata_drop_rcu(nd);
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        if (nd->root.mnt) {
+                spin_lock(&fs->lock);
+                if (nd->root.mnt != fs->root.mnt ||
+                                nd->root.dentry != fs->root.dentry)
+                        goto err_root;
+        }
+        spin_lock(&parent->d_lock);
+        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err;
+        /*
+         * If the sequence check on the child dentry passed, then the child has
+         * not been removed from its parent. This means the parent dentry must
+         * be valid and able to take a reference at this point.
+         */
+        BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+        BUG_ON(!parent->d_count);
+        parent->d_count++;
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
+        if (nd->root.mnt) {
+                path_get(&nd->root);
+                spin_unlock(&fs->lock);
+        }
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        nd->flags &= ~LOOKUP_RCU;
+        return 0;
+err:
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
+err_root:
+        if (nd->root.mnt)
+                spin_unlock(&fs->lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
+{
+        if (nd->flags & LOOKUP_RCU)
+                return nameidata_dentry_drop_rcu(nd, dentry);
+        return 0;
+}
+/**
+ * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
+ * nd->path should be the final element of the lookup, so nd->root is discarded.
+ * Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu_last(struct nameidata *nd)
+{
+        struct dentry *dentry = nd->path.dentry;
+        BUG_ON(!(nd->flags & LOOKUP_RCU));
+        nd->flags &= ~LOOKUP_RCU;
+        nd->root.mnt = NULL;
+        spin_lock(&dentry->d_lock);
+        if (!__d_rcu_to_refcount(dentry, nd->seq))
+                goto err_unlock;
+        BUG_ON(nd->inode != dentry->d_inode);
+        spin_unlock(&dentry->d_lock);
+        mntget(nd->path.mnt);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        return 0;
+err_unlock:
+        spin_unlock(&dentry->d_lock);
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        return -ECHILD;
+}
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
+{
+        if (likely(nd->flags & LOOKUP_RCU))
+                return nameidata_drop_rcu_last(nd);
+        return 0;
+}
+/**
 * release_open_intent - free up open intent resources
 * @nd: pointer to nameidata
 */
@@ -386,10 +567,33 @@ void release_open_intent(struct nameidata *nd)
                fput(nd->intent.open.file);
 }
+/*
+ * Call d_revalidate and handle filesystems that request rcu-walk
+ * to be dropped. This may be called and return in rcu-walk mode,
+ * regardless of success or error. If -ECHILD is returned, the caller
+ * must return -ECHILD back up the path walk stack so path walk may
+ * be restarted in ref-walk mode.
+ */
+static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        int status;
+        status = dentry->d_op->d_revalidate(dentry, nd);
+        if (status == -ECHILD) {
+                if (nameidata_dentry_drop_rcu(nd, dentry))
+                        return status;
+                status = dentry->d_op->d_revalidate(dentry, nd);
+        }
+        return status;
+}
 static inline struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        int status = dentry->d_op->d_revalidate(dentry, nd);
+        int status;
+        status = d_revalidate(dentry, nd);
        if (unlikely(status <= 0)) {
                /*
                 * The dentry failed validation.
@@ -397,19 +601,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * the dentry otherwise d_revalidate is asking us
                 * to return a fail status.
                 */
-                if (!status) {
+                if (status < 0) {
+                        /* If we're in rcu-walk, we don't have a ref */
+                        if (!(nd->flags & LOOKUP_RCU))
+                                dput(dentry);
+                        dentry = ERR_PTR(status);
+                } else {
+                        /* Don't d_invalidate in rcu-walk mode */
+                        if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
+                                return ERR_PTR(-ECHILD);
                        if (!d_invalidate(dentry)) {
                                dput(dentry);
                                dentry = NULL;
                        }
-                } else {
-                        dput(dentry);
-                        dentry = ERR_PTR(status);
                }
        }
        return dentry;
 }
+static inline int need_reval_dot(struct dentry *dentry)
+{
+        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+                return 0;
+        if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+                return 0;
+        return 1;
+}
 /*
 * force_reval_path - force revalidation of a dentry
 *
@@ -433,17 +654,19 @@ force_reval_path(struct path *path, struct nameidata *nd)
        /*
         * only check on filesystems where it's possible for the dentry to
-         * become stale. It's assumed that if this flag is set then the
+         * become stale.
-         * d_revalidate op will also be defined.
         */
-        if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
+        if (!need_reval_dot(dentry))
                return 0;
-        status = dentry->d_op->d_revalidate(dentry, nd);
+        status = d_revalidate(dentry, nd);
        if (status > 0)
                return 0;
        if (!status) {
+                /* Don't d_invalidate in rcu-walk mode */
+                if (nameidata_drop_rcu(nd))
+                        return -ECHILD;
                d_invalidate(dentry);
                status = -ESTALE;
        }
@@ -459,26 +682,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
 * short-cut DAC fails, then call ->permission() to do more
 * complete permission check.
 */
-static int exec_permission(struct inode *inode)
+static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
        int ret;
        if (inode->i_op->permission) {
-                ret = inode->i_op->permission(inode, MAY_EXEC);
+                ret = inode->i_op->permission(inode, MAY_EXEC, flags);
-                if (!ret)
+        } else {
-                        goto ok;
+                ret = acl_permission_check(inode, MAY_EXEC, flags,
-                return ret;
+                                inode->i_op->check_acl);
        }
-        ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+        if (likely(!ret))
-        if (!ret)
                goto ok;
+        if (ret == -ECHILD)
+                return ret;
        if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
                goto ok;
        return ret;
 ok:
-        return security_inode_permission(inode, MAY_EXEC);
+        return security_inode_exec_permission(inode, flags);
 }
 static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +713,23 @@ static __always_inline void set_root(struct nameidata *nd)
 static int link_path_walk(const char *, struct nameidata *);
+static __always_inline void set_root_rcu(struct nameidata *nd)
+{
+        if (!nd->root.mnt) {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->root = fs->root;
+                } while (read_seqcount_retry(&fs->seq, seq));
+        }
+}
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
+        int ret;
        if (IS_ERR(link))
                goto fail;
@@ -500,8 +739,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                nd->path = nd->root;
                path_get(&nd->root);
        }
+        nd->inode = nd->path.dentry->d_inode;
-        return link_path_walk(link, nd);
+        ret = link_path_walk(link, nd);
+        return ret;
 fail:
        path_put(&nd->path);
        return PTR_ERR(link);
@@ -514,30 +755,30 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
                mntput(path->mnt);
 }
-static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
+static inline void path_to_nameidata(const struct path *path,
+                                        struct nameidata *nd)
 {
-        dput(nd->path.dentry);
+        if (!(nd->flags & LOOKUP_RCU)) {
-        if (nd->path.mnt != path->mnt) {
+                dput(nd->path.dentry);
-                mntput(nd->path.mnt);
+                if (nd->path.mnt != path->mnt)
-                nd->path.mnt = path->mnt;
+                        mntput(nd->path.mnt);
        }
+        nd->path.mnt = path->mnt;
        nd->path.dentry = path->dentry;
 }
 static __always_inline int
-__do_follow_link(struct path *path, struct nameidata *nd, void **p)
+__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 {
        int error;
-        struct dentry *dentry = path->dentry;
+        struct dentry *dentry = link->dentry;
-        touch_atime(path->mnt, dentry);
+        touch_atime(link->mnt, dentry);
        nd_set_link(nd, NULL);
-        if (path->mnt != nd->path.mnt) {
+        if (link->mnt == nd->path.mnt)
-                path_to_nameidata(path, nd);
+                mntget(link->mnt);
-                dget(dentry);
-        }
-        mntget(path->mnt);
        nd->last_type = LAST_BIND;
        *p = dentry->d_inode->i_op->follow_link(dentry, nd);
        error = PTR_ERR(*p);
@@ -591,6 +832,20 @@ loop:
        return err;
 }
+static int follow_up_rcu(struct path *path)
+{
+        struct vfsmount *parent;
+        struct dentry *mountpoint;
+        parent = path->mnt->mnt_parent;
+        if (parent == path->mnt)
+                return 0;
+        mountpoint = path->mnt->mnt_mountpoint;
+        path->dentry = mountpoint;
+        path->mnt = parent;
+        return 1;
+}
 int follow_up(struct path *path)
 {
        struct vfsmount *parent;
@@ -612,58 +867,295 @@ int follow_up(struct path *path)
        return 1;
 }
-/* no need for dcache_lock, as serialization is taken care in
+/*
- * namespace.c
+ * Perform an automount
+ * - return -EISDIR to tell follow_managed() to stop and return the path we
+ *   were called with.
 */
-static int __follow_mount(struct path *path)
+static int follow_automount(struct path *path, unsigned flags,
+                            bool *need_mntput)
 {
-        int res = 0;
+        struct vfsmount *mnt;
-        while (d_mountpoint(path->dentry)) {
+        int err;
-                struct vfsmount *mounted = lookup_mnt(path);
-                if (!mounted)
+        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
-                        break;
+                return -EREMOTE;
+        /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
+         * and this is the terminal part of the path.
+         */
+        if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
+                return -EISDIR; /* we actually want to stop here */
+        /* We want to mount if someone is trying to open/create a file of any
+         * type under the mountpoint, wants to traverse through the mountpoint
+         * or wants to open the mounted directory.
+         *
+         * We don't want to mount if someone's just doing a stat and they've
+         * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
+         * appended a '/' to the name.
+         */
+        if (!(flags & LOOKUP_FOLLOW) &&
+            !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
+                       LOOKUP_OPEN | LOOKUP_CREATE)))
+                return -EISDIR;
+        current->total_link_count++;
+        if (current->total_link_count >= 40)
+                return -ELOOP;
+        mnt = path->dentry->d_op->d_automount(path);
+        if (IS_ERR(mnt)) {
+                /*
+                 * The filesystem is allowed to return -EISDIR here to indicate
+                 * it doesn't want to automount.  For instance, autofs would do
+                 * this so that its userspace daemon can mount on this dentry.
+                 *
+                 * However, we can only permit this if it's a terminal point in
+                 * the path being looked up; if it wasn't then the remainder of
+                 * the path is inaccessible and we should say so.
+                 */
+                if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
+                        return -EREMOTE;
+                return PTR_ERR(mnt);
+        }
+        if (!mnt) /* mount collision */
+                return 0;
+        err = finish_automount(mnt, path);
+        switch (err) {
+        case -EBUSY:
+                /* Someone else made a mount here whilst we were busy */
+                return 0;
+        case 0:
                dput(path->dentry);
-                if (res)
+                if (*need_mntput)
                        mntput(path->mnt);
+                path->mnt = mnt;
+                path->dentry = dget(mnt->mnt_root);
+                *need_mntput = true;
+                return 0;
+        default:
+                return err;
+        }
+}
+/*
+ * Handle a dentry that is managed in some way.
+ * - Flagged for transit management (autofs)
+ * - Flagged as mountpoint
+ * - Flagged as automount point
+ *
+ * This may only be called in refwalk mode.
+ *
+ * Serialization is taken care of in namespace.c
+ */
+static int follow_managed(struct path *path, unsigned flags)
+{
+        unsigned managed;
+        bool need_mntput = false;
+        int ret;
+        /* Given that we're not holding a lock here, we retain the value in a
+         * local variable for each dentry as we look at it so that we don't see
+         * the components of that value change under us */
+        while (managed = ACCESS_ONCE(path->dentry->d_flags),
+               managed &= DCACHE_MANAGED_DENTRY,
+               unlikely(managed != 0)) {
+                /* Allow the filesystem to manage the transit without i_mutex
+                 * being held. */
+                if (managed & DCACHE_MANAGE_TRANSIT) {
+                        BUG_ON(!path->dentry->d_op);
+                        BUG_ON(!path->dentry->d_op->d_manage);
+                        ret = path->dentry->d_op->d_manage(path->dentry,
+                                                           false, false);
+                        if (ret < 0)
+                                return ret == -EISDIR ? 0 : ret;
+                }
+                /* Transit to a mounted filesystem. */
+                if (managed & DCACHE_MOUNTED) {
+                        struct vfsmount *mounted = lookup_mnt(path);
+                        if (mounted) {
+                                dput(path->dentry);
+                                if (need_mntput)
+                                        mntput(path->mnt);
+                                path->mnt = mounted;
+                                path->dentry = dget(mounted->mnt_root);
+                                need_mntput = true;
+                                continue;
+                        }
+                        /* Something is mounted on this dentry in another
+                         * namespace and/or whatever was mounted there in this
+                         * namespace got unmounted before we managed to get the
+                         * vfsmount_lock */
+                }
+                /* Handle an automount point */
+                if (managed & DCACHE_NEED_AUTOMOUNT) {
+                        ret = follow_automount(path, flags, &need_mntput);
+                        if (ret < 0)
+                                return ret == -EISDIR ? 0 : ret;
+                        continue;
+                }
+                /* We didn't change the current path point */
+                break;
+        }
+        return 0;
+}
+int follow_down_one(struct path *path)
+{
+        struct vfsmount *mounted;
+        mounted = lookup_mnt(path);
+        if (mounted) {
+                dput(path->dentry);
+                mntput(path->mnt);
                path->mnt = mounted;
                path->dentry = dget(mounted->mnt_root);
-                res = 1;
+                return 1;
        }
-        return res;
+        return 0;
 }
-static void follow_mount(struct path *path)
+/*
+ * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
+ * meet a managed dentry and we're not walking to "..".  True is returned to
+ * continue, false to abort.
+ */
+static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+                               struct inode **inode, bool reverse_transit)
 {
        while (d_mountpoint(path->dentry)) {
-                struct vfsmount *mounted = lookup_mnt(path);
+                struct vfsmount *mounted;
+                if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+                    !reverse_transit &&
+                    path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+                        return false;
+                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                if (!mounted)
                        break;
-                dput(path->dentry);
-                mntput(path->mnt);
                path->mnt = mounted;
-                path->dentry = dget(mounted->mnt_root);
+                path->dentry = mounted->mnt_root;
+                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+                *inode = path->dentry->d_inode;
+        }
+        if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                return reverse_transit;
+        return true;
+}
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
+        struct inode *inode = nd->inode;
+        set_root_rcu(nd);
+        while (1) {
+                if (nd->path.dentry == nd->root.dentry &&
+                    nd->path.mnt == nd->root.mnt) {
+                        break;
+                }
+                if (nd->path.dentry != nd->path.mnt->mnt_root) {
+                        struct dentry *old = nd->path.dentry;
+                        struct dentry *parent = old->d_parent;
+                        unsigned seq;
+                        seq = read_seqcount_begin(&parent->d_seq);
+                        if (read_seqcount_retry(&old->d_seq, nd->seq))
+                                return -ECHILD;
+                        inode = parent->d_inode;
+                        nd->path.dentry = parent;
+                        nd->seq = seq;
+                        break;
+                }
+                if (!follow_up_rcu(&nd->path))
+                        break;
+                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+                inode = nd->path.dentry->d_inode;
        }
+        __follow_mount_rcu(nd, &nd->path, &inode, true);
+        nd->inode = inode;
+        return 0;
 }
-/* no need for dcache_lock, as serialization is taken care in
+/*
- * namespace.c
+ * Follow down to the covering mount currently visible to userspace.  At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ *
+ * Care must be taken as namespace_sem may be held (indicated by mounting_here
+ * being true).
 */
-int follow_down(struct path *path)
+int follow_down(struct path *path, bool mounting_here)
 {
-        struct vfsmount *mounted;
+        unsigned managed;
+        int ret;
-        mounted = lookup_mnt(path);
+        while (managed = ACCESS_ONCE(path->dentry->d_flags),
-        if (mounted) {
+               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
+                /* Allow the filesystem to manage the transit without i_mutex
+                 * being held.
+                 *
+                 * We indicate to the filesystem if someone is trying to mount
+                 * something here.  This gives autofs the chance to deny anyone
+                 * other than its daemon the right to mount on its
+                 * superstructure.
+                 *
+                 * The filesystem may sleep at this point.
+                 */
+                if (managed & DCACHE_MANAGE_TRANSIT) {
+                        BUG_ON(!path->dentry->d_op);
+                        BUG_ON(!path->dentry->d_op->d_manage);
+                        ret = path->dentry->d_op->d_manage(
+                                path->dentry, mounting_here, false);
+                        if (ret < 0)
+                                return ret == -EISDIR ? 0 : ret;
+                }
+                /* Transit to a mounted filesystem. */
+                if (managed & DCACHE_MOUNTED) {
+                        struct vfsmount *mounted = lookup_mnt(path);
+                        if (!mounted)
+                                break;
+                        dput(path->dentry);
+                        mntput(path->mnt);
+                        path->mnt = mounted;
+                        path->dentry = dget(mounted->mnt_root);
+                        continue;
+                }
+                /* Don't handle automount points here */
+                break;
+        }
+        return 0;
+}
+/*
+ * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
+ */
+static void follow_mount(struct path *path)
+{
+        while (d_mountpoint(path->dentry)) {
+                struct vfsmount *mounted = lookup_mnt(path);
+                if (!mounted)
+                        break;
                dput(path->dentry);
                mntput(path->mnt);
                path->mnt = mounted;
                path->dentry = dget(mounted->mnt_root);
-                return 1;
        }
-        return 0;
 }
-static __always_inline void follow_dotdot(struct nameidata *nd)
+static void follow_dotdot(struct nameidata *nd)
 {
        set_root(nd);
@@ -684,6 +1176,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
                        break;
        }
        follow_mount(&nd->path);
+        nd->inode = nd->path.dentry->d_inode;
 }
 /*
@@ -721,17 +1214,19 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
 *  It _is_ time-critical.
 */
 static int do_lookup(struct nameidata *nd, struct qstr *name,
-                     struct path *path)
+                        struct path *path, struct inode **inode)
 {
        struct vfsmount *mnt = nd->path.mnt;
-        struct dentry *dentry, *parent;
+        struct dentry *dentry, *parent = nd->path.dentry;
        struct inode *dir;
+        int err;
        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
-        if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
+        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-                int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
+                err = parent->d_op->d_hash(parent, nd->inode, name);
                if (err < 0)
                        return err;
        }
@@ -741,21 +1236,52 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
         * of a false negative due to a concurrent rename, we're going to
         * do the non-racy lookup, below.
         */
-        dentry = __d_lookup(nd->path.dentry, name);
+        if (nd->flags & LOOKUP_RCU) {
+                unsigned seq;
+                *inode = nd->inode;
+                dentry = __d_lookup_rcu(parent, name, &seq, inode);
+                if (!dentry) {
+                        if (nameidata_drop_rcu(nd))
+                                return -ECHILD;
+                        goto need_lookup;
+                }
+                /* Memory barrier in read_seqcount_begin of child is enough */
+                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+                        return -ECHILD;
+                nd->seq = seq;
+                if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+                        goto need_revalidate;
+done2:
+                path->mnt = mnt;
+                path->dentry = dentry;
+                if (likely(__follow_mount_rcu(nd, path, inode, false)))
+                        return 0;
+                if (nameidata_drop_rcu(nd))
+                        return -ECHILD;
+                /* fallthru */
+        }
+        dentry = __d_lookup(parent, name);
        if (!dentry)
                goto need_lookup;
 found:
-        if (dentry->d_op && dentry->d_op->d_revalidate)
+        if (dentry->d_flags & DCACHE_OP_REVALIDATE)
                goto need_revalidate;
 done:
        path->mnt = mnt;
        path->dentry = dentry;
-        __follow_mount(path);
+        err = follow_managed(path, nd->flags);
+        if (unlikely(err < 0)) {
+                path_put_conditional(path, nd);
+                return err;
+        }
+        *inode = path->dentry->d_inode;
        return 0;
 need_lookup:
-        parent = nd->path.dentry;
        dir = parent->d_inode;
+        BUG_ON(nd->inode != dir);
        mutex_lock(&dir->i_mutex);
        /*
@@ -789,6 +1315,8 @@ need_revalidate:
                goto need_lookup;
        if (IS_ERR(dentry))
                goto fail;
+        if (nd->flags & LOOKUP_RCU)
+                goto done2;
        goto done;
 fail:
@@ -796,17 +1324,6 @@ fail:
 }
 /*
- * This is a temporary kludge to deal with "automount" symlinks; proper
- * solution is to trigger them on follow_mount(), so that do_lookup()
- * would DTRT.  To be killed before 2.6.34-final.
- */
-static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
-{
-        return inode && unlikely(inode->i_op->follow_link) &&
-                ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
-}
-/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
@@ -817,7 +1334,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
 static int link_path_walk(const char *name, struct nameidata *nd)
 {
        struct path next;
-        struct inode *inode;
        int err;
        unsigned int lookup_flags = nd->flags;
        
@@ -826,18 +1342,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        if (!*name)
                goto return_reval;
-        inode = nd->path.dentry->d_inode;
        if (nd->depth)
                lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
        /* At this point we know we have a real path component. */
        for(;;) {
+                struct inode *inode;
                unsigned long hash;
                struct qstr this;
                unsigned int c;
                nd->flags |= LOOKUP_CONTINUE;
-                err = exec_permission(inode);
+                if (nd->flags & LOOKUP_RCU) {
+                        err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+                        if (err == -ECHILD) {
+                                if (nameidata_drop_rcu(nd))
+                                        return -ECHILD;
+                                goto exec_again;
+                        }
+                } else {
+exec_again:
+                        err = exec_permission(nd->inode, 0);
+                }
                if (err)
                        break;
@@ -868,37 +1394,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
-                        case 2: 
+                        case 2:
                                if (this.name[1] != '.')
                                        break;
-                                follow_dotdot(nd);
+                                if (nd->flags & LOOKUP_RCU) {
-                                inode = nd->path.dentry->d_inode;
+                                        if (follow_dotdot_rcu(nd))
+                                                return -ECHILD;
+                                } else
+                                        follow_dotdot(nd);
                                /* fallthrough */
                        case 1:
                                continue;
                }
                /* This does the actual lookups.. */
-                err = do_lookup(nd, &this, &next);
+                err = do_lookup(nd, &this, &next, &inode);
                if (err)
                        break;
                err = -ENOENT;
-                inode = next.dentry->d_inode;
                if (!inode)
                        goto out_dput;
                if (inode->i_op->follow_link) {
+                        /* We commonly drop rcu-walk here */
+                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+                                return -ECHILD;
+                        BUG_ON(inode != next.dentry->d_inode);
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
+                        nd->inode = nd->path.dentry->d_inode;
                        err = -ENOENT;
-                        inode = nd->path.dentry->d_inode;
+                        if (!nd->inode)
-                        if (!inode)
                                break;
-                } else
+                } else {
                        path_to_nameidata(&next, nd);
+                        nd->inode = inode;
+                }
                err = -ENOTDIR; 
-                if (!inode->i_op->lookup)
+                if (!nd->inode->i_op->lookup)
                        break;
                continue;
                /* here ends the main loop */
@@ -913,32 +1446,40 @@ last_component:
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
-                        case 2: 
+                        case 2:
                                if (this.name[1] != '.')
                                        break;
-                                follow_dotdot(nd);
+                                if (nd->flags & LOOKUP_RCU) {
-                                inode = nd->path.dentry->d_inode;
+                                        if (follow_dotdot_rcu(nd))
+                                                return -ECHILD;
+                                } else
+                                        follow_dotdot(nd);
                                /* fallthrough */
                        case 1:
                                goto return_reval;
                }
-                err = do_lookup(nd, &this, &next);
+                err = do_lookup(nd, &this, &next, &inode);
                if (err)
                        break;
-                inode = next.dentry->d_inode;
+                if (inode && unlikely(inode->i_op->follow_link) &&
-                if (follow_on_final(inode, lookup_flags)) {
+                    (lookup_flags & LOOKUP_FOLLOW)) {
+                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+                                return -ECHILD;
+                        BUG_ON(inode != next.dentry->d_inode);
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
-                        inode = nd->path.dentry->d_inode;
+                        nd->inode = nd->path.dentry->d_inode;
-                } else
+                } else {
                        path_to_nameidata(&next, nd);
+                        nd->inode = inode;
+                }
                err = -ENOENT;
-                if (!inode)
+                if (!nd->inode)
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
-                        if (!inode->i_op->lookup)
+                        if (!nd->inode->i_op->lookup)
                                break;
                }
                goto return_base;
@@ -958,25 +1499,43 @@ return_reval:
                 * We bypassed the ordinary revalidation routines.
                 * We may need to check the cached dentry for staleness.
                 */
-                if (nd->path.dentry && nd->path.dentry->d_sb &&
+                if (need_reval_dot(nd->path.dentry)) {
-                    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
-                        err = -ESTALE;
                        /* Note: we do not d_invalidate() */
-                        if (!nd->path.dentry->d_op->d_revalidate(
+                        err = d_revalidate(nd->path.dentry, nd);
-                                        nd->path.dentry, nd))
+                        if (!err)
+                                err = -ESTALE;
+                        if (err < 0)
                                break;
                }
 return_base:
+                if (nameidata_drop_rcu_last_maybe(nd))
+                        return -ECHILD;
                return 0;
 out_dput:
-                path_put_conditional(&next, nd);
+                if (!(nd->flags & LOOKUP_RCU))
+                        path_put_conditional(&next, nd);
                break;
        }
-        path_put(&nd->path);
+        if (!(nd->flags & LOOKUP_RCU))
+                path_put(&nd->path);
 return_err:
        return err;
 }
+static inline int path_walk_rcu(const char *name, struct nameidata *nd)
+{
+        current->total_link_count = 0;
+        return link_path_walk(name, nd);
+}
+static inline int path_walk_simple(const char *name, struct nameidata *nd)
+{
+        current->total_link_count = 0;
+        return link_path_walk(name, nd);
+}
 static int path_walk(const char *name, struct nameidata *nd)
 {
        struct path save = nd->path;
@@ -1002,6 +1561,93 @@ static int path_walk(const char *name, struct nameidata *nd)
        return result;
 }
+static void path_finish_rcu(struct nameidata *nd)
+{
+        if (nd->flags & LOOKUP_RCU) {
+                /* RCU dangling. Cancel it. */
+                nd->flags &= ~LOOKUP_RCU;
+                nd->root.mnt = NULL;
+                rcu_read_unlock();
+                br_read_unlock(vfsmount_lock);
+        }
+        if (nd->file)
+                fput(nd->file);
+}
+static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+{
+        int retval = 0;
+        int fput_needed;
+        struct file *file;
+        nd->last_type = LAST_ROOT; /* if there are only slashes... */
+        nd->flags = flags | LOOKUP_RCU;
+        nd->depth = 0;
+        nd->root.mnt = NULL;
+        nd->file = NULL;
+        if (*name=='/') {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->root = fs->root;
+                        nd->path = nd->root;
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                } while (read_seqcount_retry(&fs->seq, seq));
+        } else if (dfd == AT_FDCWD) {
+                struct fs_struct *fs = current->fs;
+                unsigned seq;
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+                do {
+                        seq = read_seqcount_begin(&fs->seq);
+                        nd->path = fs->pwd;
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                } while (read_seqcount_retry(&fs->seq, seq));
+        } else {
+                struct dentry *dentry;
+                file = fget_light(dfd, &fput_needed);
+                retval = -EBADF;
+                if (!file)
+                        goto out_fail;
+                dentry = file->f_path.dentry;
+                retval = -ENOTDIR;
+                if (!S_ISDIR(dentry->d_inode->i_mode))
+                        goto fput_fail;
+                retval = file_permission(file, MAY_EXEC);
+                if (retval)
+                        goto fput_fail;
+                nd->path = file->f_path;
+                if (fput_needed)
+                        nd->file = file;
+                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                br_read_lock(vfsmount_lock);
+                rcu_read_lock();
+        }
+        nd->inode = nd->path.dentry->d_inode;
+        return 0;
+fput_fail:
+        fput_light(file, fput_needed);
+out_fail:
+        return retval;
+}
 static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
        int retval = 0;
@@ -1042,6 +1688,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
                fput_light(file, fput_needed);
        }
+        nd->inode = nd->path.dentry->d_inode;
        return 0;
 fput_fail:
@@ -1054,16 +1701,53 @@ out_fail:
 static int do_path_lookup(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
 {
-        int retval = path_init(dfd, name, flags, nd);
+        int retval;
-        if (!retval)
-                retval = path_walk(name, nd);
+        /*
-        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
+         * Path walking is largely split up into 2 different synchronisation
-                                nd->path.dentry->d_inode))
+         * schemes, rcu-walk and ref-walk (explained in
-                audit_inode(name, nd->path.dentry);
+         * Documentation/filesystems/path-lookup.txt). These share much of the
+         * path walk code, but some things (particularly setup, cleanup, and
+         * following mounts) are sufficiently divergent that functions are
+         * duplicated. Typically there is a function foo(), and its RCU
+         * analogue, foo_rcu().
+         *
+         * -ECHILD is the error number of choice (just to avoid clashes) that
+         * is returned if some aspect of an rcu-walk fails. Such an error must
+         * be handled by restarting a traditional ref-walk (which will always
+         * be able to complete).
+         */
+        retval = path_init_rcu(dfd, name, flags, nd);
+        if (unlikely(retval))
+                return retval;
+        retval = path_walk_rcu(name, nd);
+        path_finish_rcu(nd);
        if (nd->root.mnt) {
                path_put(&nd->root);
                nd->root.mnt = NULL;
        }
+        if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
+                /* slower, locked walk */
+                if (retval == -ESTALE)
+                        flags |= LOOKUP_REVAL;
+                retval = path_init(dfd, name, flags, nd);
+                if (unlikely(retval))
+                        return retval;
+                retval = path_walk(name, nd);
+                if (nd->root.mnt) {
+                        path_put(&nd->root);
+                        nd->root.mnt = NULL;
+                }
+        }
+        if (likely(!retval)) {
+                if (unlikely(!audit_dummy_context())) {
+                        if (nd->path.dentry && nd->inode)
+                                audit_inode(name, nd->path.dentry);
+                }
+        }
        return retval;
 }
@@ -1106,10 +1790,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
        path_get(&nd->path);
        nd->root = nd->path;
        path_get(&nd->root);
+        nd->inode = nd->path.dentry->d_inode;
        retval = path_walk(name, nd);
        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-                                nd->path.dentry->d_inode))
+                                nd->inode))
                audit_inode(name, nd->path.dentry);
        path_put(&nd->root);
@@ -1125,7 +1810,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
        struct dentry *dentry;
        int err;
-        err = exec_permission(inode);
+        err = exec_permission(inode, 0);
        if (err)
                return ERR_PTR(err);
@@ -1133,8 +1818,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
         * See if the low-level filesystem might want
         * to use its own hash..
         */
-        if (base->d_op && base->d_op->d_hash) {
+        if (base->d_flags & DCACHE_OP_HASH) {
-                err = base->d_op->d_hash(base, name);
+                err = base->d_op->d_hash(base, inode, name);
                dentry = ERR_PTR(err);
                if (err < 0)
                        goto out;
@@ -1147,7 +1832,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
         */
        dentry = d_lookup(base, name);
-        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+        if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
                dentry = do_revalidate(dentry, nd);
        if (!dentry)
@@ -1448,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
        return break_lease(inode, flag);
 }
-static int handle_truncate(struct path *path)
+static int handle_truncate(struct file *filp)
 {
+        struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
@@ -1463,7 +2149,7 @@ static int handle_truncate(struct path *path)
        if (!error) {
                error = do_truncate(path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
-                                    NULL);
+                                    filp);
        }
        put_write_access(inode);
        return error;
@@ -1490,6 +2176,7 @@ out_unlock:
        mutex_unlock(&dir->d_inode->i_mutex);
        dput(nd->path.dentry);
        nd->path.dentry = path->dentry;
        if (error)
                return error;
        /* Don't check for write permission, don't truncate */
@@ -1560,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
        }
        if (!IS_ERR(filp)) {
                if (will_truncate) {
-                        error = handle_truncate(&nd->path);
+                        error = handle_truncate(filp);
                        if (error) {
                                fput(filp);
                                filp = ERR_PTR(error);
@@ -1584,6 +2271,9 @@ exit:
        return ERR_PTR(error);
 }
+/*
+ * Handle O_CREAT case for do_filp_open
+ */
 static struct file *do_last(struct nameidata *nd, struct path *path,
                            int open_flag, int acc_mode,
                            int mode, const char *pathname)
@@ -1597,50 +2287,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                follow_dotdot(nd);
                dir = nd->path.dentry;
        case LAST_DOT:
-                if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+                if (need_reval_dot(dir)) {
-                        if (!dir->d_op->d_revalidate(dir, nd)) {
+                        int status = d_revalidate(nd->path.dentry, nd);
-                                error = -ESTALE;
+                        if (!status)
+                                status = -ESTALE;
+                        if (status < 0) {
+                                error = status;
                                goto exit;
                        }
                }
                /* fallthrough */
        case LAST_ROOT:
-                if (open_flag & O_CREAT)
+                goto exit;
-                        goto exit;
-                /* fallthrough */
        case LAST_BIND:
                audit_inode(pathname, dir);
                goto ok;
        }
        /* trailing slashes? */
-        if (nd->last.name[nd->last.len]) {
+        if (nd->last.name[nd->last.len])
-                if (open_flag & O_CREAT)
+                goto exit;
-                        goto exit;
-                nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
-        }
-        /* just plain open? */
-        if (!(open_flag & O_CREAT)) {
-                error = do_lookup(nd, &nd->last, path);
-                if (error)
-                        goto exit;
-                error = -ENOENT;
-                if (!path->dentry->d_inode)
-                        goto exit_dput;
-                if (path->dentry->d_inode->i_op->follow_link)
-                        return NULL;
-                error = -ENOTDIR;
-                if (nd->flags & LOOKUP_DIRECTORY) {
-                        if (!path->dentry->d_inode->i_op->lookup)
-                                goto exit_dput;
-                }
-                path_to_nameidata(path, nd);
-                audit_inode(pathname, nd->path.dentry);
-                goto ok;
-        }
-        /* OK, it's O_CREAT */
        mutex_lock(&dir->d_inode->i_mutex);
        path->dentry = lookup_hash(nd);
@@ -1697,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (open_flag & O_EXCL)
                goto exit_dput;
-        if (__follow_mount(path)) {
+        error = follow_managed(path, nd->flags);
-                error = -ELOOP;
+        if (error < 0)
-                if (open_flag & O_NOFOLLOW)
+                goto exit_dput;
-                        goto exit_dput;
-        }
        error = -ENOENT;
        if (!path->dentry->d_inode)
@@ -1711,8 +2376,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                return NULL;
        path_to_nameidata(path, nd);
+        nd->inode = path->dentry->d_inode;
        error = -EISDIR;
-        if (S_ISDIR(path->dentry->d_inode->i_mode))
+        if (S_ISDIR(nd->inode->i_mode))
                goto exit;
 ok:
        filp = finish_open(nd, open_flag, acc_mode);
@@ -1743,11 +2409,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
        struct path path;
        int count = 0;
        int flag = open_to_namei_flags(open_flag);
-        int force_reval = 0;
+        int flags;
        if (!(open_flag & O_CREAT))
                mode = 0;
+        /* Must never be set by userspace */
+        open_flag &= ~FMODE_NONOTIFY;
        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if the need any syncing at all we enforce it's
@@ -1769,54 +2438,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
        if (open_flag & O_APPEND)
                acc_mode |= MAY_APPEND;
-        /* find the parent */
+        flags = LOOKUP_OPEN;
-reval:
+        if (open_flag & O_CREAT) {
-        error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+                flags |= LOOKUP_CREATE;
+                if (open_flag & O_EXCL)
+                        flags |= LOOKUP_EXCL;
+        }
+        if (open_flag & O_DIRECTORY)
+                flags |= LOOKUP_DIRECTORY;
+        if (!(open_flag & O_NOFOLLOW))
+                flags |= LOOKUP_FOLLOW;
+        filp = get_empty_filp();
+        if (!filp)
+                return ERR_PTR(-ENFILE);
+        filp->f_flags = open_flag;
+        nd.intent.open.file = filp;
+        nd.intent.open.flags = flag;
+        nd.intent.open.create_mode = mode;
+        if (open_flag & O_CREAT)
+                goto creat;
+        /* !O_CREAT, simple open */
+        error = do_path_lookup(dfd, pathname, flags, &nd);
+        if (unlikely(error))
+                goto out_filp;
+        error = -ELOOP;
+        if (!(nd.flags & LOOKUP_FOLLOW)) {
+                if (nd.inode->i_op->follow_link)
+                        goto out_path;
+        }
+        error = -ENOTDIR;
+        if (nd.flags & LOOKUP_DIRECTORY) {
+                if (!nd.inode->i_op->lookup)
+                        goto out_path;
+        }
+        audit_inode(pathname, nd.path.dentry);
+        filp = finish_open(&nd, open_flag, acc_mode);
+        return filp;
+creat:
+        /* OK, have to create the file. Find the parent. */
+        error = path_init_rcu(dfd, pathname,
+                        LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
        if (error)
-                return ERR_PTR(error);
+                goto out_filp;
-        if (force_reval)
+        error = path_walk_rcu(pathname, &nd);
-                nd.flags |= LOOKUP_REVAL;
+        path_finish_rcu(&nd);
+        if (unlikely(error == -ECHILD || error == -ESTALE)) {
+                /* slower, locked walk */
+                if (error == -ESTALE) {
+reval:
+                        flags |= LOOKUP_REVAL;
+                }
+                error = path_init(dfd, pathname,
+                                LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
+                if (error)
+                        goto out_filp;
-        current->total_link_count = 0;
+                error = path_walk_simple(pathname, &nd);
-        error = link_path_walk(pathname, &nd);
-        if (error) {
-                filp = ERR_PTR(error);
-                goto out;
        }
-        if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+        if (unlikely(error))
+                goto out_filp;
+        if (unlikely(!audit_dummy_context()))
                audit_inode(pathname, nd.path.dentry);
        /*
         * We have the parent and last component.
         */
+        nd.flags = flags;
-        error = -ENFILE;
-        filp = get_empty_filp();
-        if (filp == NULL)
-                goto exit_parent;
-        nd.intent.open.file = filp;
-        filp->f_flags = open_flag;
-        nd.intent.open.flags = flag;
-        nd.intent.open.create_mode = mode;
-        nd.flags &= ~LOOKUP_PARENT;
-        nd.flags |= LOOKUP_OPEN;
-        if (open_flag & O_CREAT) {
-                nd.flags |= LOOKUP_CREATE;
-                if (open_flag & O_EXCL)
-                        nd.flags |= LOOKUP_EXCL;
-        }
-        if (open_flag & O_DIRECTORY)
-                nd.flags |= LOOKUP_DIRECTORY;
-        if (!(open_flag & O_NOFOLLOW))
-                nd.flags |= LOOKUP_FOLLOW;
        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
-                struct path holder;
+                struct path link = path;
-                struct inode *inode = path.dentry->d_inode;
+                struct inode *linki = link.dentry->d_inode;
                void *cookie;
                error = -ELOOP;
-                /* S_ISDIR part is a temporary automount kludge */
+                if (!(nd.flags & LOOKUP_FOLLOW))
-                if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
                        goto exit_dput;
                if (count++ == 32)
                        goto exit_dput;
@@ -1832,41 +2531,37 @@ reval:
                 * just set LAST_BIND.
                 */
                nd.flags |= LOOKUP_PARENT;
-                error = security_inode_follow_link(path.dentry, &nd);
+                error = security_inode_follow_link(link.dentry, &nd);
                if (error)
                        goto exit_dput;
-                error = __do_follow_link(&path, &nd, &cookie);
+                error = __do_follow_link(&link, &nd, &cookie);
                if (unlikely(error)) {
+                        if (!IS_ERR(cookie) && linki->i_op->put_link)
+                                linki->i_op->put_link(link.dentry, &nd, cookie);
                        /* nd.path had been dropped */
-                        if (!IS_ERR(cookie) && inode->i_op->put_link)
+                        nd.path = link;
-                                inode->i_op->put_link(path.dentry, &nd, cookie);
+                        goto out_path;
-                        path_put(&path);
-                        release_open_intent(&nd);
-                        filp = ERR_PTR(error);
-                        goto out;
                }
-                holder = path;
                nd.flags &= ~LOOKUP_PARENT;
                filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-                if (inode->i_op->put_link)
+                if (linki->i_op->put_link)
-                        inode->i_op->put_link(holder.dentry, &nd, cookie);
+                        linki->i_op->put_link(link.dentry, &nd, cookie);
-                path_put(&holder);
+                path_put(&link);
        }
 out:
        if (nd.root.mnt)
                path_put(&nd.root);
-        if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+        if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-                force_reval = 1;
                goto reval;
-        }
        return filp;
 exit_dput:
        path_put_conditional(&path, &nd);
+out_path:
+        path_put(&nd.path);
+out_filp:
        if (!IS_ERR(nd.intent.open.file))
                release_open_intent(&nd);
-exit_parent:
-        path_put(&nd.path);
        filp = ERR_PTR(error);
        goto out;
 }
@@ -2127,12 +2822,10 @@ void dentry_unhash(struct dentry *dentry)
 {
        dget(dentry);
        shrink_dcache_parent(dentry);
-        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count) == 2)
+        if (dentry->d_count == 2)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
 }
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2881,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
 };
 EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e5..7b0b9537169 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
        mnt->mnt_group_id = 0;
 }
+/*
+ * Add n (which may be negative) to the reference count of mnt: with
+ * CONFIG_SMP the current CPU's mnt_pcp->mnt_count slot is adjusted,
+ * otherwise the single mnt_count field is adjusted with preemption disabled.
+ *
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_add_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
+#else
+        preempt_disable();
+        mnt->mnt_count += n;
+        preempt_enable();
+#endif
+}
+static inline void mnt_set_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+        this_cpu_write(mnt->mnt_pcp->mnt_count, n);
+#else
+        mnt->mnt_count = n;
+#endif
+}
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_inc_count(struct vfsmount *mnt)
+{
+        mnt_add_count(mnt, 1);
+}
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_dec_count(struct vfsmount *mnt)
+{
+        mnt_add_count(mnt, -1);
+}
+/*
+ * Return the total reference count of mnt: with CONFIG_SMP this is the sum
+ * of the mnt_count slots of every possible CPU, otherwise it is the plain
+ * mnt_count field.
+ *
+ * vfsmount lock must be held for write
+ */
+unsigned int mnt_get_count(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        unsigned int count = 0;
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
+        }
+        return count;
+#else
+        return mnt->mnt_count;
+#endif
+}
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
        struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
                                goto out_free_id;
                }
-                atomic_set(&mnt->mnt_count, 1);
+#ifdef CONFIG_SMP
+                mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+                if (!mnt->mnt_pcp)
+                        goto out_free_devname;
+                this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+#else
+                mnt->mnt_count = 1;
+                mnt->mnt_writers = 0;
+#endif
                INIT_LIST_HEAD(&mnt->mnt_hash);
                INIT_LIST_HEAD(&mnt->mnt_child);
                INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 #ifdef CONFIG_FSNOTIFY
                INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
-#ifdef CONFIG_SMP
-                mnt->mnt_writers = alloc_percpu(int);
-                if (!mnt->mnt_writers)
-                        goto out_free_devname;
-#else
-                mnt->mnt_writers = 0;
-#endif
        }
        return mnt;
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-static inline void inc_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_inc_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
 #else
        mnt->mnt_writers++;
 #endif
 }
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_dec_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
 #else
        mnt->mnt_writers--;
 #endif
 }
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static unsigned int mnt_get_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
        unsigned int count = 0;
        int cpu;
        for_each_possible_cpu(cpu) {
-                count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
        }
        return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
        int ret = 0;
        preempt_disable();
-        inc_mnt_writers(mnt);
+        mnt_inc_writers(mnt);
        /*
-         * The store to inc_mnt_writers must be visible before we pass
+         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
         */
        smp_rmb();
        if (__mnt_is_readonly(mnt)) {
-                dec_mnt_writers(mnt);
+                mnt_dec_writers(mnt);
                ret = -EROFS;
                goto out;
        }
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
        if (__mnt_is_readonly(mnt))
                return -EROFS;
        preempt_disable();
-        inc_mnt_writers(mnt);
+        mnt_inc_writers(mnt);
        preempt_enable();
        return 0;
 }
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
 void mnt_drop_write(struct vfsmount *mnt)
 {
        preempt_disable();
-        dec_mnt_writers(mnt);
+        mnt_dec_writers(mnt);
        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
-        if (count_mnt_writers(mnt) > 0)
+        if (mnt_get_writers(mnt) > 0)
                ret = -EBUSY;
        else
                mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
        kfree(mnt->mnt_devname);
        mnt_free_id(mnt);
 #ifdef CONFIG_SMP
-        free_percpu(mnt->mnt_writers);
+        free_percpu(mnt->mnt_pcp);
 #endif
        kmem_cache_free(mnt_cache, mnt);
 }
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 }
 /*
+ * Clear dentry's mounted state if it has no remaining mounts.
+ * vfsmount_lock must be held for write.
+ *
+ * Walks every bucket of mount_hashtable and returns without touching the
+ * dentry as soon as some mount still has it as its mnt_mountpoint; only
+ * when no such mount is found is DCACHE_MOUNTED cleared, under d_lock.
+ * The mnt argument is not used by the body.
+ */
+static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+        unsigned u;
+        for (u = 0; u < HASH_SIZE; u++) {
+                struct vfsmount *p;
+                list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
+                        if (p->mnt_mountpoint == dentry)
+                                return;
+                }
+        }
+        spin_lock(&dentry->d_lock);
+        dentry->d_flags &= ~DCACHE_MOUNTED;
+        spin_unlock(&dentry->d_lock);
+}
+/*
 * vfsmount lock must be held for write
 */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
        mnt->mnt_mountpoint = mnt->mnt_root;
        list_del_init(&mnt->mnt_child);
        list_del_init(&mnt->mnt_hash);
-        old_path->dentry->d_mounted--;
+        dentry_reset_mounted(old_path->mnt, old_path->dentry);
 }
 /*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 {
        child_mnt->mnt_parent = mntget(mnt);
        child_mnt->mnt_mountpoint = dget(dentry);
-        dentry->d_mounted++;
+        spin_lock(&dentry->d_lock);
+        dentry->d_flags |= DCACHE_MOUNTED;
+        spin_unlock(&dentry->d_lock);
 }
 /*
@@ -528,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
        list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
 }
+static inline void __mnt_make_longterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        atomic_inc(&mnt->mnt_longterm);
+#endif
+}
+/* needs vfsmount lock for write */
+static inline void __mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        atomic_dec(&mnt->mnt_longterm);
+#endif
+}
 /*
 * vfsmount lock must be held for write
 */
@@ -541,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
        BUG_ON(parent == mnt);
        list_add_tail(&head, &mnt->mnt_list);
-        list_for_each_entry(m, &head, mnt_list)
+        list_for_each_entry(m, &head, mnt_list) {
                m->mnt_ns = n;
+                __mnt_make_longterm(m);
+        }
        list_splice(&head, n->list.prev);
        list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -630,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
        return NULL;
 }
-static inline void __mntput(struct vfsmount *mnt)
+static inline void mntfree(struct vfsmount *mnt)
 {
        struct super_block *sb = mnt->mnt_sb;
        /*
         * This probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this
@@ -640,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt)
         * to make r/w->r/o transitions.
         */
        /*
-         * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+         * The locking used to deal with mnt_count decrement provides barriers,
-         * provides barriers, so count_mnt_writers() below is safe.  AV
+         * so mnt_get_writers() below is safe.
         */
-        WARN_ON(count_mnt_writers(mnt));
+        WARN_ON(mnt_get_writers(mnt));
        fsnotify_vfsmount_delete(mnt);
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
 }
-void mntput_no_expire(struct vfsmount *mnt)
+static void mntput_no_expire(struct vfsmount *mnt)
 {
-repeat:
+put_again:
-        if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+#ifdef CONFIG_SMP
+        br_read_lock(vfsmount_lock);
+        if (likely(atomic_read(&mnt->mnt_longterm))) {
+                mnt_dec_count(mnt);
+                br_read_unlock(vfsmount_lock);
                return;
+        }
+        br_read_unlock(vfsmount_lock);
        br_write_lock(vfsmount_lock);
-        if (!atomic_dec_and_test(&mnt->mnt_count)) {
+        mnt_dec_count(mnt);
+        if (mnt_get_count(mnt)) {
                br_write_unlock(vfsmount_lock);
                return;
        }
-        if (likely(!mnt->mnt_pinned)) {
+#else
-                br_write_unlock(vfsmount_lock);
+        mnt_dec_count(mnt);
-                __mntput(mnt);
+        if (likely(mnt_get_count(mnt)))
                return;
+        br_write_lock(vfsmount_lock);
+#endif
+        if (unlikely(mnt->mnt_pinned)) {
+                mnt_add_count(mnt, mnt->mnt_pinned + 1);
+                mnt->mnt_pinned = 0;
+                br_write_unlock(vfsmount_lock);
+                acct_auto_close_mnt(mnt);
+                goto put_again;
        }
-        atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-        mnt->mnt_pinned = 0;
        br_write_unlock(vfsmount_lock);
-        acct_auto_close_mnt(mnt);
+        mntfree(mnt);
-        goto repeat;
 }
-EXPORT_SYMBOL(mntput_no_expire);
+void mntput(struct vfsmount *mnt)
+{
+        if (mnt) {
+                /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+                if (unlikely(mnt->mnt_expiry_mark))
+                        mnt->mnt_expiry_mark = 0;
+                mntput_no_expire(mnt);
+        }
+}
+EXPORT_SYMBOL(mntput);
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+        if (mnt)
+                mnt_inc_count(mnt);
+        return mnt;
+}
+EXPORT_SYMBOL(mntget);
 void mnt_pin(struct vfsmount *mnt)
 {
@@ -679,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt)
        mnt->mnt_pinned++;
        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 void mnt_unpin(struct vfsmount *mnt)
 {
        br_write_lock(vfsmount_lock);
        if (mnt->mnt_pinned) {
-                atomic_inc(&mnt->mnt_count);
+                mnt_inc_count(mnt);
                mnt->mnt_pinned--;
        }
        br_write_unlock(vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt)
        int minimum_refs = 0;
        struct vfsmount *p;
-        br_read_lock(vfsmount_lock);
+        /* write lock needed for mnt_get_count */
+        br_write_lock(vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
-                actual_refs += atomic_read(&p->mnt_count);
+                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
        }
-        br_read_unlock(vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        if (actual_refs > minimum_refs)
                return 0;
@@ -1018,10 +1150,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-        br_read_lock(vfsmount_lock);
+        br_write_lock(vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-        br_read_unlock(vfsmount_lock);
+        br_write_unlock(vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1058,26 +1190,29 @@ void release_mounts(struct list_head *head)
 */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
+        LIST_HEAD(tmp_list);
        struct vfsmount *p;
        for (p = mnt; p; p = next_mnt(p, mnt))
-                list_move(&p->mnt_hash, kill);
+                list_move(&p->mnt_hash, &tmp_list);
        if (propagate)
-                propagate_umount(kill);
+                propagate_umount(&tmp_list);
-        list_for_each_entry(p, kill, mnt_hash) {
+        list_for_each_entry(p, &tmp_list, mnt_hash) {
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                __touch_mnt_namespace(p->mnt_ns);
                p->mnt_ns = NULL;
+                __mnt_make_shortterm(p);
                list_del_init(&p->mnt_child);
                if (p->mnt_parent != p) {
                        p->mnt_parent->mnt_ghosts++;
-                        p->mnt_mountpoint->d_mounted--;
+                        dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
                }
                change_mnt_propagation(p, MS_PRIVATE);
        }
+        list_splice(&tmp_list, kill);
 }
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1103,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;
-                if (atomic_read(&mnt->mnt_count) != 2)
+                /*
+                 * probably don't strictly need the lock here if we examined
+                 * all race cases, but it's a slowpath.
+                 */
+                br_write_lock(vfsmount_lock);
+                if (mnt_get_count(mnt) != 2) {
+                        br_write_unlock(vfsmount_lock); /* release, not re-take, before bailing out */
+                        return -EBUSY;
+                }
+                br_write_unlock(vfsmount_lock);
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
@@ -1668,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
                return err;
        down_write(&namespace_sem);
-        while (d_mountpoint(path->dentry) &&
+        err = follow_down(path, true);
-               follow_down(path))
+        if (err < 0)
-                ;
+                goto out;
        err = -EINVAL;
        if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
                goto out;
@@ -1728,6 +1872,8 @@ out:
        return err;
 }
+static int do_add_mount(struct vfsmount *, struct path *, int);
 /*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
@@ -1736,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
                        int mnt_flags, char *name, void *data)
 {
        struct vfsmount *mnt;
+        int err;
        if (!type)
                return -EINVAL;
@@ -1748,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
-        return do_add_mount(mnt, path, mnt_flags, NULL);
+        err = do_add_mount(mnt, path, mnt_flags);
+        if (err)
+                mntput(mnt);
+        return err;
+}
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+        int err;
+        /* The new mount record should have at least 2 refs to prevent it being
+         * expired before we get a chance to add it
+         */
+        BUG_ON(mnt_get_count(m) < 2);
+        if (m->mnt_sb == path->mnt->mnt_sb &&
+            m->mnt_root == path->dentry) { /* m's root is the very dentry being mounted on */
+                err = -ELOOP;
+                goto fail;
+        }
+        err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+        if (!err)
+                return 0; /* success: no reference is dropped here */
+fail:
+        /* remove m from any expiration list it may be on */
+        if (!list_empty(&m->mnt_expire)) {
+                down_write(&namespace_sem);
+                br_write_lock(vfsmount_lock);
+                list_del_init(&m->mnt_expire);
+                br_write_unlock(vfsmount_lock);
+                up_write(&namespace_sem);
+        }
+        mntput(m);
+        mntput(m); /* NOTE(review): presumably the two puts drop the two refs asserted by the BUG_ON above - confirm */
+        return err;
 }
 /*
 * add a mount into a namespace's mount tree
- * - provide the option of adding the new mount to an expiration list
 */
-int do_add_mount(struct vfsmount *newmnt, struct path *path,
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
-                 int mnt_flags, struct list_head *fslist)
 {
        int err;
@@ -1764,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
        down_write(&namespace_sem);
        /* Something was mounted here while we slept */
-        while (d_mountpoint(path->dentry) &&
+        err = follow_down(path, true);
-               follow_down(path))
+        if (err < 0)
-                ;
+                goto unlock;
        err = -EINVAL;
        if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
                goto unlock;
@@ -1782,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
                goto unlock;
        newmnt->mnt_flags = mnt_flags;
-        if ((err = graft_tree(newmnt, path)))
+        err = graft_tree(newmnt, path);
-                goto unlock;
-        if (fslist) /* add to the specified expiration list */
-                list_add_tail(&newmnt->mnt_expire, fslist);
-        up_write(&namespace_sem);
-        return 0;
 unlock:
        up_write(&namespace_sem);
-        mntput(newmnt);
        return err;
 }
-EXPORT_SYMBOL_GPL(do_add_mount);
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ *
+ * Appends @mnt's mnt_expire entry to the tail of @expiry_list while holding
+ * namespace_sem for write and vfsmount_lock for write.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+        down_write(&namespace_sem);
+        br_write_lock(vfsmount_lock);
+        list_add_tail(&mnt->mnt_expire, expiry_list);
+        br_write_unlock(vfsmount_lock);
+        up_write(&namespace_sem);
+}
+EXPORT_SYMBOL(mnt_set_expiry);
 /*
 * process a list of expirable mountpoints with the intent of discarding any
@@ -2086,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
        return new_ns;
 }
+void mnt_make_longterm(struct vfsmount *mnt)
+{
+        __mnt_make_longterm(mnt);
+}
+void mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) /* fast path: decrement unless the count is exactly 1 */
+                return;
+        br_write_lock(vfsmount_lock); /* the remaining decrement is done with vfsmount_lock held for write */
+        atomic_dec(&mnt->mnt_longterm);
+        br_write_unlock(vfsmount_lock);
+#endif /* without CONFIG_SMP the body is empty */
+}
 /*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
@@ -2123,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
        q = new_ns->root;
        while (p) {
                q->mnt_ns = new_ns;
+                __mnt_make_longterm(q);
                if (fs) {
                        if (p == fs->root.mnt) {
-                                rootmnt = p;
                                fs->root.mnt = mntget(q);
+                                __mnt_make_longterm(q);
+                                mnt_make_shortterm(p);
+                                rootmnt = p;
                        }
                        if (p == fs->pwd.mnt) {
-                                pwdmnt = p;
                                fs->pwd.mnt = mntget(q);
+                                __mnt_make_longterm(q);
+                                mnt_make_shortterm(p);
+                                pwdmnt = p;
                        }
                }
                p = next_mnt(p, mnt_ns->root);
@@ -2174,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
        new_ns = alloc_mnt_ns();
        if (!IS_ERR(new_ns)) {
                mnt->mnt_ns = new_ns;
+                __mnt_make_longterm(mnt);
                new_ns->root = mnt;
                list_add(&new_ns->list, &new_ns->root->mnt_list);
        }
@@ -2328,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        br_write_unlock(vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
        path_put(&root_parent);
        path_put(&parent_path);
@@ -2354,6 +2564,7 @@ static void __init init_mount_tree(void)
        mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");
        ns = create_mnt_ns(mnt);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919..f6946bb5cb5 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,13 +17,11 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
-#include <linux/smp_lock.h>
-#include <linux/ncp_fs.h>
+#include "ncp_fs.h"
-#include "ncplib_kernel.h"
 static void ncp_read_volume_list(struct file *, void *, filldir_t,
                                struct ncp_cache_control *);
@@ -75,11 +73,14 @@ const struct inode_operations ncp_dir_inode_operations =
 * Dentry operations routines
 */
 static int ncp_lookup_validate(struct dentry *, struct nameidata *);
-static int ncp_hash_dentry(struct dentry *, struct qstr *);
+static int ncp_hash_dentry(const struct dentry *, const struct inode *,
-static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
+                struct qstr *);
-static int ncp_delete_dentry(struct dentry *);
+static int ncp_compare_dentry(const struct dentry *, const struct inode *,
+                const struct dentry *, const struct inode *,
-static const struct dentry_operations ncp_dentry_operations =
+                unsigned int, const char *, const struct qstr *);
+static int ncp_delete_dentry(const struct dentry *);
+const struct dentry_operations ncp_dentry_operations =
 {
        .d_revalidate   = ncp_lookup_validate,
        .d_hash         = ncp_hash_dentry,
@@ -87,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations =
        .d_delete       = ncp_delete_dentry,
 };
-const struct dentry_operations ncp_root_dentry_operations =
-{
-        .d_hash         = ncp_hash_dentry,
-        .d_compare      = ncp_compare_dentry,
-        .d_delete       = ncp_delete_dentry,
-};
 #define ncp_namespace(i)        (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
 static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
@@ -114,10 +107,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
 #define ncp_preserve_case(i)    (ncp_namespace(i) != NW_NS_DOS)
-static inline int ncp_case_sensitive(struct dentry *dentry)
+static inline int ncp_case_sensitive(const struct inode *i)
 {
 #ifdef CONFIG_NCPFS_NFS_NS
-        return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+        return ncp_namespace(i) == NW_NS_NFS;
 #else
        return 0;
 #endif /* CONFIG_NCPFS_NFS_NS */
@@ -128,14 +121,16 @@ static inline int ncp_case_sensitive(struct dentry *dentry)
 * is case-sensitive.
 */
 static int 
-ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *this)
 {
-        if (!ncp_case_sensitive(dentry)) {
+        if (!ncp_case_sensitive(inode)) {
+                struct super_block *sb = dentry->d_sb;
                struct nls_table *t;
                unsigned long hash;
                int i;
-                t = NCP_IO_TABLE(dentry);
+                t = NCP_IO_TABLE(sb);
                hash = init_name_hash();
                for (i=0; i<this->len ; i++)
                        hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -146,15 +141,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 }
 static int
-ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)     /* compares str/len against name */
 {
-        if (a->len != b->len)
+        if (len != name->len)   /* lengths differ: 1 is returned without looking at the bytes */
                 return 1;
-        if (ncp_case_sensitive(dentry))
+        if (ncp_case_sensitive(pinode)) /* case sensitivity is decided from the parent's inode */
-                return strncmp(a->name, b->name, a->len);
+                return strncmp(str, name->name, len);   /* case-sensitive: plain byte comparison */
-        return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
+        return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);  /* otherwise ncp_strnicmp with the superblock's I/O table */
 }
 /*
@@ -163,7 +160,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
 * Closing files can be safely postponed until iput() - it's done there anyway.
 */
 static int
-ncp_delete_dentry(struct dentry * dentry)
+ncp_delete_dentry(const struct dentry * dentry)
 {
        struct inode *inode = dentry->d_inode;
@@ -302,6 +299,12 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
        int res, val = 0, len;
        __u8 __name[NCP_MAXPATHLEN + 1];
+        if (dentry == dentry->d_sb->s_root)
+                return 1;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
@@ -385,21 +388,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
        }
        /* If a pointer is invalid, we search the dentry. */
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dent = list_entry(next, struct dentry, d_u.d_child);
                if ((unsigned long)dent->d_fsdata == fpos) {
                        if (dent->d_inode)
-                                dget_locked(dent);
+                                dget(dent);
                        else
                                dent = NULL;
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&parent->d_lock);
                        goto out;
                }
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
        return NULL;
 out:
@@ -593,7 +596,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        qname.hash = full_name_hash(qname.name, qname.len);
        if (dentry->d_op && dentry->d_op->d_hash)
-                if (dentry->d_op->d_hash(dentry, &qname) != 0)
+                if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
                        goto end_advance;
        newdent = d_lookup(dentry, &qname);
@@ -612,35 +615,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                        shrink_dcache_parent(newdent);
                /*
-                 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
+                 * NetWare's OS2 namespace is case preserving yet case
-                 * case preserving yet case insensitive.  So we update dentry's name
+                 * insensitive.  So we update dentry's name as received from
-                 * as received from server.  We found dentry via d_lookup with our
+                 * server. Parent dir's i_mutex is locked because we're in
-                 * hash, so we know that hash does not change, and so replacing name
+                 * readdir.
-                 * should be reasonably safe.
                 */
-                if (qname.len == newdent->d_name.len &&
+                dentry_update_name_case(newdent, &qname);
-                    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
-                        struct inode *inode = newdent->d_inode;
-                        /*
-                         * Inside ncpfs all uses of d_name are either for debugging,
-                         * or on functions which acquire inode mutex (mknod, creat,
-                         * lookup).  So grab i_mutex here, to be sure.  d_path
-                         * uses dcache_lock when generating path, so we should too.
-                         * And finally d_compare is protected by dentry's d_lock, so
-                         * here we go.
-                         */
-                        if (inode)
-                                mutex_lock(&inode->i_mutex);
-                        spin_lock(&dcache_lock);
-                        spin_lock(&newdent->d_lock);
-                        memcpy((char *) newdent->d_name.name, qname.name,
-                                                                newdent->d_name.len);
-                        spin_unlock(&newdent->d_lock);
-                        spin_unlock(&dcache_lock);
-                        if (inode)
-                                mutex_unlock(&inode->i_mutex);
-                }
        }
        if (!newdent->d_inode) {
@@ -650,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                entry->ino = iunique(dir->i_sb, 2);
                inode = ncp_iget(dir->i_sb, entry);
                if (inode) {
-                        newdent->d_op = &ncp_dentry_operations;
                        d_instantiate(newdent, inode);
                        if (!hashed)
                                d_rehash(newdent);
@@ -658,7 +637,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        } else {
                struct inode *inode = newdent->d_inode;
-                mutex_lock(&inode->i_mutex);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
                ncp_update_inode2(inode, entry);
                mutex_unlock(&inode->i_mutex);
        }
@@ -906,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
        if (inode) {
                ncp_new_dentry(dentry);
 add_entry:
-                dentry->d_op = &ncp_dentry_operations;
                d_add(dentry, inode);
                error = 0;
        }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c52..0ed65e0c3df 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,10 +17,8 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/ncp_fs.h>
+#include "ncp_fs.h"
-#include "ncplib_kernel.h"
 static int ncp_fsync(struct file *file, int datasync)
 {
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c..00a1d1c3d3a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,16 +26,14 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/namei.h>
-#include <linux/ncp_fs.h>
 #include <net/sock.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 #include "getopt.h"
 #define NCP_DEFAULT_FILE_MODE 0600
@@ -59,11 +57,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ncp_destroy_inode(struct inode *inode)
+static void ncp_i_callback(struct rcu_head *head)       /* callback that ncp_destroy_inode() hands to call_rcu() */
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu); /* head is the i_rcu member embedded in the inode */
+        INIT_LIST_HEAD(&inode->i_dentry);       /* i_dentry is reset to an empty list before the object is freed */
         kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
+static void ncp_destroy_inode(struct inode *inode)
+{       /* Does not free directly: queues ncp_i_callback() on the inode's i_rcu head. */
+        call_rcu(&inode->i_rcu, ncp_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
@@ -310,7 +315,12 @@ static void ncp_stop_tasks(struct ncp_server *server) {
        sk->sk_write_space  = server->write_space;
        release_sock(sk);
        del_timer_sync(&server->timeout_tm);
-        flush_scheduled_work();
+        flush_work_sync(&server->rcv.tq);
+        if (sk->sk_socket->type == SOCK_STREAM)
+                flush_work_sync(&server->tx.tq);
+        else
+                flush_work_sync(&server->timeout_tq);
 }
 static int  ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -532,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        sb->s_blocksize_bits = 10;
        sb->s_magic = NCP_SUPER_MAGIC;
        sb->s_op = &ncp_sops;
+        sb->s_d_op = &ncp_dentry_operations;
        sb->s_bdi = &server->bdi;
        server = NCP_SBP(sb);
@@ -711,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        sb->s_root = d_alloc_root(root_inode);
        if (!sb->s_root)
                goto out_no_root;
-        sb->s_root->d_op = &ncp_root_dentry_operations;
        return 0;
 out_no_root:
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c..790e92a9ec6 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,15 +17,12 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/ncp_fs.h>
 #include <asm/uaccess.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 /* maximum limit for ncp_objectname_ioctl */
 #define NCP_OBJECT_NAME_MAX_LEN 4096
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1e..a7c07b44b10 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
 #include <linux/mman.h>
 #include <linux/string.h>
 #include <linux/fcntl.h>
-#include <linux/ncp_fs.h>
-#include "ncplib_kernel.h"
 #include <asm/uaccess.h>
 #include <asm/system.h>
+#include "ncp_fs.h"
 /*
 * Fill in the supplied page for mmap
 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 00000000000..31831afe1c3
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
+#include <linux/ncp_fs.h>
+#include "ncp_fs_i.h"
+#include "ncp_fs_sb.h"
+/* define because it is easy to change PRINTK to {*}PRINTK */
+#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
+#undef NCPFS_PARANOIA
+#ifdef NCPFS_PARANOIA
+#define PPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define PPRINTK(format, args...)
+#endif
+#ifndef DEBUG_NCP
+#define DEBUG_NCP 0
+#endif
+#if DEBUG_NCP > 0
+#define DPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define DPRINTK(format, args...)
+#endif
+#if DEBUG_NCP > 1
+#define DDPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define DDPRINTK(format, args...)
+#endif
+#define NCP_MAX_RPC_TIMEOUT (6*HZ)
+struct ncp_entry_info {         /* argument type of ncp_iget(), ncp_update_inode() and ncp_update_inode2() */
+        struct nw_info_struct   i;
+        ino_t                   ino;    /* ncp_fill_cache() in dir.c fills this from iunique() before ncp_iget() */
+        int                     opened;
+        int                     access;
+        unsigned int            volume;
+        __u8                    file_handle[6]; /* NOTE(review): presumably mirrors ncp_inode_info.file_handle - confirm */
+};
+static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
+{       /* Returns sb->s_fs_info typed as struct ncp_server *; no NULL check is made. */
+        return sb->s_fs_info;
+}
+#define NCP_SERVER(inode)       NCP_SBP((inode)->i_sb)
+static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
+{       /* inode must be the vfs_inode member of a struct ncp_inode_info; returns that enclosing struct. */
+        return container_of(inode, struct ncp_inode_info, vfs_inode);
+}
+/* linux/fs/ncpfs/inode.c */
+int ncp_notify_change(struct dentry *, struct iattr *);
+struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
+void ncp_update_inode(struct inode *, struct ncp_entry_info *);
+void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
+/* linux/fs/ncpfs/dir.c */
+extern const struct inode_operations ncp_dir_inode_operations;
+extern const struct file_operations ncp_dir_operations;
+extern const struct dentry_operations ncp_dentry_operations;
+int ncp_conn_logged_in(struct super_block *);
+int ncp_date_dos2unix(__le16 time, __le16 date);
+void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
+/* linux/fs/ncpfs/ioctl.c */
+long ncp_ioctl(struct file *, unsigned int, unsigned long);
+long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
+/* linux/fs/ncpfs/sock.c */
+int ncp_request2(struct ncp_server *server, int function,
+        void* reply, int max_reply_size);
+static inline int ncp_request(struct ncp_server *server, int function) {       /* ncp_request2() with the server's own buffer */
+        return ncp_request2(server, function, server->packet, server->packet_size);    /* reply area = server->packet, max size = server->packet_size */
+}
+int ncp_connect(struct ncp_server *server);
+int ncp_disconnect(struct ncp_server *server);
+void ncp_lock_server(struct ncp_server *server);
+void ncp_unlock_server(struct ncp_server *server);
+/* linux/fs/ncpfs/symlink.c */
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+extern const struct address_space_operations ncp_symlink_aops;
+int ncp_symlink(struct inode*, struct dentry*, const char*);
+#endif
+/* linux/fs/ncpfs/file.c */
+extern const struct inode_operations ncp_file_inode_operations;
+extern const struct file_operations ncp_file_operations;
+int ncp_make_open(struct inode *, int);
+/* linux/fs/ncpfs/mmap.c */
+int ncp_mmap(struct file *, struct vm_area_struct *);
+/* linux/fs/ncpfs/ncplib_kernel.c */
+int ncp_make_closed(struct inode *);
+#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 00000000000..4b0bec47784
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
+/*
+ *  ncp_fs_i.h
+ *
+ *  Copyright (C) 1995 Volker Lendecke
+ *
+ */
+#ifndef _LINUX_NCP_FS_I
+#define _LINUX_NCP_FS_I
+/*
+ * This is the ncpfs part of the inode structure. This must contain
+ * all the information we need to work with an inode after creation.
+ */
+struct ncp_inode_info {         /* freed to ncp_inode_cachep by ncp_i_callback() in inode.c */
+        __le32  dirEntNum;
+        __le32  DosDirNum;
+        __u8    volNumber;      /* index into ncp_server.name_space[] (see ncp_namespace() in dir.c) */
+        __le32  nwattr;
+        struct mutex open_mutex;
+        atomic_t        opened;
+        int     access;
+        int     flags;          /* NOTE(review): presumably holds NCPI_* bits such as the one defined below - confirm */
+#define NCPI_KLUDGE_SYMLINK     0x0001
+        __u8    file_handle[6];
+        struct inode vfs_inode; /* embedded VFS inode; NCP_FINFO() maps a pointer to it back to this struct */
+};
+#endif  /* _LINUX_NCP_FS_I */
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 00000000000..4af803f1351
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
+/*
+ *  ncp_fs_sb.h
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *
+ */
+#ifndef _NCP_FS_SB
+#define _NCP_FS_SB
+#include <linux/types.h>
+#include <linux/ncp_mount.h>
+#include <linux/net.h>
+#include <linux/mutex.h>
+#include <linux/backing-dev.h>
+#include <linux/workqueue.h>
+#define NCP_DEFAULT_OPTIONS 0           /* 2 for packet signatures */
+struct sock;
+struct ncp_mount_data_kernel {  /* mount data; embedded by value as member m of struct ncp_server */
+        unsigned long    flags;         /* NCP_MOUNT_* flags */
+        unsigned int     int_flags;     /* internal flags */
+#define NCP_IMOUNT_LOGGEDIN_POSSIBLE    0x0001
+        __kernel_uid32_t mounted_uid;   /* Who may umount() this filesystem? */
+        struct pid      *wdog_pid;      /* Who cares for our watchdog packets? */
+        unsigned int     ncp_fd;        /* The socket to the ncp port */
+        unsigned int     time_out;      /* How long should I wait after
+                                           sending a NCP request? */
+        unsigned int     retry_count;   /* And how often should I retry? */
+        unsigned char    mounted_vol[NCP_VOLNAME_LEN + 1];      /* one byte more than NCP_VOLNAME_LEN (presumably for a NUL - confirm) */
+        __kernel_uid32_t uid;
+        __kernel_gid32_t gid;
+        __kernel_mode_t  file_mode;
+        __kernel_mode_t  dir_mode;
+        int              info_fd;       /* NOTE(review): presumably the fd behind ncp_server.info_filp/info_sock - confirm in inode.c */
+};
+struct ncp_server {             /* object NCP_SBP() returns from sb->s_fs_info */
+        struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
+                                           interest for us later, so we store
+                                           it completely. */
+        __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];     /* indexed by ncp_inode_info.volNumber in ncp_namespace() */
+        struct file *ncp_filp;  /* File pointer to ncp socket */
+        struct socket *ncp_sock;/* ncp socket */
+        struct file *info_filp;
+        struct socket *info_sock;
+        u8 sequence;
+        u8 task;
+        u16 connection;         /* Remote connection number */
+        u8 completion;          /* Status message from server */
+        u8 conn_status;         /* Bit 4 = 1 ==> Server going down, no
+                                   requests allowed anymore.
+                                   Bit 0 = 1 ==> Server is down. */
+        int buffer_size;        /* Negotiated bufsize */
+        int reply_size;         /* Size of last reply */
+        int packet_size;        /* passed by ncp_request() as the maximum reply size for packet */
+        unsigned char *packet;  /* Here we prepare requests and
+                                   receive replies */
+        unsigned char *txbuf;   /* Storage for current request */
+        unsigned char *rxbuf;   /* Storage for reply to current request */
+        int lock;               /* To prevent mismatch in protocols. */
+        struct mutex mutex;
+        int current_size;       /* for packet preparation */
+        int has_subfunction;
+        int ncp_reply_size;
+        int root_setuped;
+        struct mutex root_setup_lock;
+        /* info for packet signing */
+        int sign_wanted;        /* 1=Server needs signed packets */
+        int sign_active;        /* 0=don't do signing, 1=do */
+        char sign_root[8];      /* generated from password and encr. key */
+        char sign_last[16];     
+        /* Authentication info: NDS or BINDERY, username */
+        struct {
+                int     auth_type;
+                size_t  object_name_len;
+                void*   object_name;
+                int     object_type;
+        } auth;
+        /* Password info */
+        struct {
+                size_t  len;
+                void*   data;
+        } priv;
+        struct rw_semaphore auth_rwsem;
+        /* nls info: codepage for volume and charset for I/O */
+        struct nls_table *nls_vol;
+        struct nls_table *nls_io;       /* the table NCP_IO_TABLE(sb) evaluates to in its NCP_SBP()-based definition */
+        /* maximum age in jiffies */
+        atomic_t dentry_ttl;
+        /* miscellaneous */
+        unsigned int flags;             /* bit field accessed through NCP_SET_FLAG/NCP_CLR_FLAG/NCP_IS_FLAG (e.g. NCP_FLAG_UTF8) */
+        spinlock_t requests_lock;       /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
+        void (*data_ready)(struct sock* sk, int len);
+        void (*error_report)(struct sock* sk);
+        void (*write_space)(struct sock* sk);   /* STREAM mode only */
+        struct {
+                struct work_struct tq;          /* STREAM/DGRAM: data/error ready */
+                struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
+                struct mutex creq_mutex;        /* DGRAM only: lock accesses to rcv.creq */
+                unsigned int state;             /* STREAM only: receiver state */
+                struct {
+                        __u32 magic __packed;
+                        __u32 len __packed;
+                        __u16 type __packed;
+                        __u16 p1 __packed;
+                        __u16 p2 __packed;
+                        __u16 p3 __packed;
+                        __u16 type2 __packed;
+                } buf;                          /* STREAM only: temporary buffer */
+                unsigned char* ptr;             /* STREAM only: pointer to data */
+                size_t len;                     /* STREAM only: length of data to receive */
+        } rcv;
+        struct {
+                struct list_head requests;      /* STREAM only: queued requests */
+                struct work_struct tq;          /* STREAM only: transmitter ready */
+                struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
+        } tx;
+        struct timer_list timeout_tm;           /* DGRAM only: timeout timer */
+        struct work_struct timeout_tq;          /* DGRAM only: associated queue, we run timers from process context */
+        int timeout_last;                       /* DGRAM only: current timeout length */
+        int timeout_retries;                    /* DGRAM only: retries left */
+        struct {
+                size_t len;
+                __u8 data[128];
+        } unexpected_packet;
+        struct backing_dev_info bdi;
+};
+extern void ncp_tcp_rcv_proc(struct work_struct *work);
+extern void ncp_tcp_tx_proc(struct work_struct *work);
+extern void ncpdgram_rcv_proc(struct work_struct *work);
+extern void ncpdgram_timeout_proc(struct work_struct *work);
+extern void ncpdgram_timeout_call(unsigned long server);
+extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_write_space(struct sock* sk);
+extern void ncp_tcp_error_report(struct sock* sk);
+#define NCP_FLAG_UTF8   1
+#define NCP_CLR_FLAG(server, flag)      ((server)->flags &= ~(flag))
+#define NCP_SET_FLAG(server, flag)      ((server)->flags |= (flag))
+#define NCP_IS_FLAG(server, flag)       ((server)->flags & (flag))
+static inline int ncp_conn_valid(struct ncp_server *server)
+{       /* Returns 1 only when neither bit 0 nor bit 4 of conn_status is set. */
+        return ((server->conn_status & 0x11) == 0);     /* 0x11 = bit 0 (server is down) | bit 4 (server going down) */
+}
+static inline void ncp_invalidate_conn(struct ncp_server *server)
+{       /* After this, ncp_conn_valid() returns 0 for the same server. */
+        server->conn_status |= 0x01;    /* bit 0 is documented on conn_status as "Server is down" */
+}
+#endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index a95615a0b6a..981a95617fc 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 static inline void assert_server_locked(struct ncp_server *server)
 {
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 3c57eca634c..09881e6aa5a 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
 #include <linux/ctype.h>
 #endif /* CONFIG_NCPFS_NLS */
-#include <linux/ncp_fs.h>
 #define NCP_MIN_SYMLINK_SIZE    8
 #define NCP_MAX_SYMLINK_SIZE    512
@@ -135,7 +133,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
                                const unsigned char *, unsigned int, int);
 #define NCP_ESC                 ':'
-#define NCP_IO_TABLE(dentry)    (NCP_SERVER((dentry)->d_inode)->nls_io)
+#define NCP_IO_TABLE(sb)        (NCP_SBP(sb)->nls_io)
 #define ncp_tolower(t, c)       nls_tolower(t, c)
 #define ncp_toupper(t, c)       nls_toupper(t, c)
 #define ncp_strnicmp(t, s1, s2, len) \
@@ -150,15 +148,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
 int ncp__vol2io(unsigned char *, unsigned int *,
                                const unsigned char *, unsigned int, int);
-#define NCP_IO_TABLE(dentry)    NULL
+#define NCP_IO_TABLE(sb)        NULL
 #define ncp_tolower(t, c)       tolower(c)
 #define ncp_toupper(t, c)       toupper(c)
 #define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
 #define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
-static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
+static inline int ncp_strnicmp(const struct nls_table *t,
-                const unsigned char *s2, int len)
+                const unsigned char *s1, const unsigned char *s2, int len)
 {
        while (len--) {
                if (tolower(*s1++) != tolower(*s2++))
@@ -193,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent)
        struct list_head *next;
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -205,7 +203,7 @@ ncp_renew_dentries(struct dentry *parent)
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 }
 static inline void
@@ -215,7 +213,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
        struct list_head *next;
        struct dentry *dentry;
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        next = parent->d_subdirs.next;
        while (next != &parent->d_subdirs) {
                dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -223,7 +221,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
                ncp_age_dentry(server, dentry);
                next = next->next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&parent->d_lock);
 }
 struct ncp_cache_head {
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index d8b2d7e6910..08907599dcd 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,6 +11,7 @@
 #include <linux/string.h>
 #include <linux/ncp.h>
 #include <linux/bitops.h>
+#include "ncp_fs.h"
 #include "ncpsign_kernel.h"
 /* i386: 32-bit, little endian, handles mis-alignment */
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381c..d9a1438bb1f 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
 #ifndef _NCPSIGN_KERNEL_H
 #define _NCPSIGN_KERNEL_H
-#include <linux/ncp_fs.h>
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
 int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 668bd267346..3a1587222c8 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
 #include <linux/poll.h>
 #include <linux/file.h>
-#include <linux/ncp_fs.h>
+#include "ncp_fs.h"
 #include "ncpsign_kernel.h"
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b33..661f861d80c 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/ncp_fs.h>
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 /* these magic numbers must appear in the symlink file -- this makes it a bit
   more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe81..e3d29426905 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
 #include <linux/completion.h>
 #include <linux/ip.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
@@ -17,9 +16,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
-#if defined(CONFIG_NFS_V4_1)
 #include <linux/sunrpc/bc_xprt.h>
-#endif
 #include <net/inet_sock.h>
@@ -178,30 +175,38 @@ nfs41_callback_svc(void *vrqstp)
 struct svc_rqst *
 nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 {
-        struct svc_xprt *bc_xprt;
+        struct svc_rqst *rqstp;
-        struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
+        int ret;
-        dprintk("--> %s\n", __func__);
+        /*
-        /* Create a svc_sock for the service */
+         * Create an svc_sock for the back channel service that shares the
-        bc_xprt = svc_sock_create(serv, xprt->prot);
+         * fore channel connection.
-        if (!bc_xprt)
+         * Returns the input port (0) and sets the svc_serv bc_xprt on success
+         */
+        ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+                              SVC_SOCK_ANONYMOUS);
+        if (ret < 0) {
+                rqstp = ERR_PTR(ret);
                goto out;
+        }
        /*
         * Save the svc_serv in the transport so that it can
         * be referenced when the session backchannel is initialized
         */
-        serv->bc_xprt = bc_xprt;
        xprt->bc_serv = serv;
        INIT_LIST_HEAD(&serv->sv_cb_list);
        spin_lock_init(&serv->sv_cb_lock);
        init_waitqueue_head(&serv->sv_cb_waitq);
        rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
-        if (IS_ERR(rqstp))
+        if (IS_ERR(rqstp)) {
-                svc_sock_destroy(bc_xprt);
+                svc_xprt_put(serv->sv_bc_xprt);
+                serv->sv_bc_xprt = NULL;
+        }
 out:
-        dprintk("--> %s return %p\n", __func__, rqstp);
+        dprintk("--> %s return %ld\n", __func__,
+                IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
        return rqstp;
 }
@@ -323,58 +328,58 @@ void nfs_callback_down(int minorversion)
        mutex_unlock(&nfs_callback_mutex);
 }
-static int check_gss_callback_principal(struct nfs_client *clp,
+/* Boolean check of RPC_AUTH_GSS principal */
-                                        struct svc_rqst *rqstp)
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 {
        struct rpc_clnt *r = clp->cl_rpcclient;
        char *p = svc_gss_principal(rqstp);
+        if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+                return 1;
+        /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+        if (clp->cl_minorversion != 0)
+                return 0;
        /*
         * It might just be a normal user principal, in which case
         * userspace won't bother to tell us the name at all.
         */
        if (p == NULL)
-                return SVC_DENIED;
+                return 0;
        /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
        if (memcmp(p, "nfs@", 4) != 0)
-                return SVC_DENIED;
+                return 0;
        p += 4;
        if (strcmp(p, r->cl_server) != 0)
-                return SVC_DENIED;
+                return 0;
-        return SVC_OK;
+        return 1;
 }
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Drop packets with incorrect authflavor.
+ *
+ * All other checking is done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-        struct nfs_client *clp;
-        RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-        int ret = SVC_OK;
-        /* Don't talk to strangers */
-        clp = nfs_find_client(svc_addr(rqstp), 4);
-        if (clp == NULL)
-                return SVC_DROP;
-        dprintk("%s: %s NFSv4 callback!\n", __func__,
-                        svc_print_addr(rqstp, buf, sizeof(buf)));
        switch (rqstp->rq_authop->flavour) {
-                case RPC_AUTH_NULL:
+        case RPC_AUTH_NULL:
-                        if (rqstp->rq_proc != CB_NULL)
+                if (rqstp->rq_proc != CB_NULL)
-                                ret = SVC_DENIED;
+                        return SVC_DROP;
-                        break;
+                break;
-                case RPC_AUTH_UNIX:
+        case RPC_AUTH_GSS:
-                        break;
+                /* No RPC_AUTH_GSS support yet in NFSv4.1 */
-                case RPC_AUTH_GSS:
+                 if (svc_is_backchannel(rqstp))
-                        ret = check_gss_callback_principal(clp, rqstp);
+                        return SVC_DROP;
-                        break;
-                default:
-                        ret = SVC_DENIED;
        }
-        nfs_put_client(clp);
+        return SVC_OK;
-        return ret;
 }
 /*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8d..46d93ce7311 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
 */
 #ifndef __LINUX_FS_NFS_CALLBACK_H
 #define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
 #define NFS4_CALLBACK 0x40000000
 #define NFS4_CALLBACK_XDRSIZE 2048
@@ -34,10 +35,16 @@ enum nfs4_callback_opnum {
        OP_CB_ILLEGAL = 10044,
 };
+struct cb_process_state {
+        __be32                  drc_status;
+        struct nfs_client       *clp;
+};
 struct cb_compound_hdr_arg {
        unsigned int taglen;
        const char *tag;
        unsigned int minorversion;
+        unsigned int cb_ident; /* v4.0 callback identifier */
        unsigned nops;
 };
@@ -103,14 +110,23 @@ struct cb_sequenceres {
        uint32_t                        csr_target_highestslotid;
 };
-extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
-                                       struct cb_sequenceres *res);
+                                       struct cb_sequenceres *res,
+                                       struct cb_process_state *cps);
 extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
                                             const nfs4_stateid *stateid);
 #define RCA4_TYPE_MASK_RDATA_DLG        0
 #define RCA4_TYPE_MASK_WDATA_DLG        1
+#define RCA4_TYPE_MASK_DIR_DLG         2
+#define RCA4_TYPE_MASK_FILE_LAYOUT     3
+#define RCA4_TYPE_MASK_BLK_LAYOUT      4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN  8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
 struct cb_recallanyargs {
        struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
        uint32_t        craa_type_mask;
 };
-extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
+                                        void *dummy,
+                                        struct cb_process_state *cps);
 struct cb_recallslotargs {
        struct sockaddr *crsa_addr;
        uint32_t        crsa_target_max_slots;
 };
-extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
+extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
-                                          void *dummy);
+                                         void *dummy,
+                                         struct cb_process_state *cps);
-#endif /* CONFIG_NFS_V4_1 */
+struct cb_layoutrecallargs {
+        struct sockaddr         *cbl_addr;
+        uint32_t                cbl_recall_type;
+        uint32_t                cbl_layout_type;
+        uint32_t                cbl_layoutchanged;
+        union {
+                struct {
+                        struct nfs_fh           cbl_fh;
+                        struct pnfs_layout_range cbl_range;
+                        nfs4_stateid            cbl_stateid;
+                };
+                struct nfs_fsid         cbl_fsid;
+        };
+};
-extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
+extern unsigned nfs4_callback_layoutrecall(
-extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+        struct cb_layoutrecallargs *args,
+        void *dummy, struct cb_process_state *cps);
+extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
+extern void nfs4_cb_take_slot(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4_1 */
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+                                    struct cb_getattrres *res,
+                                    struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+                                   struct cb_process_state *cps);
 #ifdef CONFIG_NFS_V4
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
                                            const nfs4_stateid *stateid);
+extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4 */
 /*
 * nfs41: Callbacks are expected to not cause substantial latency,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61..89587573fe5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 #ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 #endif
- 
-__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+                             struct cb_getattrres *res,
+                             struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct nfs_delegation *delegation;
        struct nfs_inode *nfsi;
        struct inode *inode;
+        res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+        if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+                goto out;
        res->bitmap[0] = res->bitmap[1] = 0;
        res->status = htonl(NFS4ERR_BADHANDLE);
-        clp = nfs_find_client(args->addr, 4);
-        if (clp == NULL)
-                goto out;
        dprintk("NFS: GETATTR callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-        inode = nfs_delegation_find_inode(clp, &args->fh);
+        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
        if (inode == NULL)
-                goto out_putclient;
+                goto out;
        nfsi = NFS_I(inode);
        rcu_read_lock();
        delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 out_iput:
        rcu_read_unlock();
        iput(inode);
-out_putclient:
-        nfs_put_client(clp);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
        return res->status;
 }
-__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+                            struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct inode *inode;
        __be32 res;
        
-        res = htonl(NFS4ERR_BADHANDLE);
+        res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->addr, 4);
+        if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: RECALL callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-        do {
+        res = htonl(NFS4ERR_BADHANDLE);
-                struct nfs_client *prev = clp;
+        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+        if (inode == NULL)
-                inode = nfs_delegation_find_inode(clp, &args->fh);
+                goto out;
-                if (inode != NULL) {
+        /* Set up a helper thread to actually return the delegation */
-                        /* Set up a helper thread to actually return the delegation */
+        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
-                        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
+        case 0:
-                                case 0:
+                res = 0;
-                                        res = 0;
+                break;
-                                        break;
+        case -ENOENT:
-                                case -ENOENT:
+                if (res != 0)
-                                        if (res != 0)
+                        res = htonl(NFS4ERR_BAD_STATEID);
-                                                res = htonl(NFS4ERR_BAD_STATEID);
+                break;
-                                        break;
+        default:
-                                default:
+                res = htonl(NFS4ERR_RESOURCE);
-                                        res = htonl(NFS4ERR_RESOURCE);
+        }
-                        }
+        iput(inode);
-                        iput(inode);
-                }
-                clp = nfs_find_client_next(prev);
-                nfs_put_client(prev);
-        } while (clp != NULL);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
        return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 #if defined(CONFIG_NFS_V4_1)
+static u32 initiate_file_draining(struct nfs_client *clp,
+                                  struct cb_layoutrecallargs *args)
+{
+        struct pnfs_layout_hdr *lo;
+        struct inode *ino;
+        bool found = false;
+        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+        LIST_HEAD(free_me_list);
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+                if (nfs_compare_fh(&args->cbl_fh,
+                                   &NFS_I(lo->plh_inode)->fh))
+                        continue;
+                ino = igrab(lo->plh_inode);
+                if (!ino)
+                        continue;
+                found = true;
+                /* Without this, layout can be freed as soon
+                 * as we release cl_lock.
+                 */
+                get_layout_hdr(lo);
+                break;
+        }
+        spin_unlock(&clp->cl_lock);
+        if (!found)
+                return NFS4ERR_NOMATCHING_LAYOUT;
+        spin_lock(&ino->i_lock);
+        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+            mark_matching_lsegs_invalid(lo, &free_me_list,
+                                        args->cbl_range.iomode))
+                rv = NFS4ERR_DELAY;
+        else
+                rv = NFS4ERR_NOMATCHING_LAYOUT;
+        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+        spin_unlock(&ino->i_lock);
+        pnfs_free_lseg_list(&free_me_list);
+        put_layout_hdr(lo);
+        iput(ino);
+        return rv;
+}
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+                                  struct cb_layoutrecallargs *args)
+{
+        struct pnfs_layout_hdr *lo;
+        struct inode *ino;
+        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+        struct pnfs_layout_hdr *tmp;
+        LIST_HEAD(recall_list);
+        LIST_HEAD(free_me_list);
+        struct pnfs_layout_range range = {
+                .iomode = IOMODE_ANY,
+                .offset = 0,
+                .length = NFS4_MAX_UINT64,
+        };
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+                if ((args->cbl_recall_type == RETURN_FSID) &&
+                    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+                           &args->cbl_fsid, sizeof(struct nfs_fsid)))
+                        continue;
+                if (!igrab(lo->plh_inode))
+                        continue;
+                get_layout_hdr(lo);
+                BUG_ON(!list_empty(&lo->plh_bulk_recall));
+                list_add(&lo->plh_bulk_recall, &recall_list);
+        }
+        spin_unlock(&clp->cl_lock);
+        list_for_each_entry_safe(lo, tmp,
+                                 &recall_list, plh_bulk_recall) {
+                ino = lo->plh_inode;
+                spin_lock(&ino->i_lock);
+                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+                if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+                        rv = NFS4ERR_DELAY;
+                list_del_init(&lo->plh_bulk_recall);
+                spin_unlock(&ino->i_lock);
+                put_layout_hdr(lo);
+                iput(ino);
+        }
+        pnfs_free_lseg_list(&free_me_list);
+        return rv;
+}
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+                                    struct cb_layoutrecallargs *args)
+{
+        u32 res = NFS4ERR_DELAY;
+        dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+        if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
+                goto out;
+        if (args->cbl_recall_type == RETURN_FILE)
+                res = initiate_file_draining(clp, args);
+        else
+                res = initiate_bulk_draining(clp, args);
+        clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+out:
+        dprintk("%s returning %i\n", __func__, res);
+        return res;
+}
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+                                  void *dummy, struct cb_process_state *cps)
+{
+        u32 res;
+        dprintk("%s: -->\n", __func__);
+        if (cps->clp)
+                res = do_callback_layoutrecall(cps->clp, args);
+        else
+                res = NFS4ERR_OP_NOT_IN_SESSION;
+        dprintk("%s: exit with status = %d\n", __func__, res);
+        return cpu_to_be32(res);
+}
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+        struct cb_layoutrecallargs args;
+        /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+        memset(&args, 0, sizeof(args));
+        args.cbl_recall_type = RETURN_ALL;
+        /* FIXME we ignore errors, what should we do? */
+        do_callback_layoutrecall(clp, &args);
+}
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
        if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 }
 /*
- * Returns a pointer to a held 'struct nfs_client' that matches the server's
- * address, major version number, and session ID.  It is the caller's
- * responsibility to release the returned reference.
- *
- * Returns NULL if there are no connections with sessions, or if no session
- * matches the one of interest.
- */
- static struct nfs_client *find_client_with_session(
-        const struct sockaddr *addr, u32 nfsversion,
-        struct nfs4_sessionid *sessionid)
-{
-        struct nfs_client *clp;
-        clp = nfs_find_client(addr, 4);
-        if (clp == NULL)
-                return NULL;
-        do {
-                struct nfs_client *prev = clp;
-                if (clp->cl_session != NULL) {
-                        if (memcmp(clp->cl_session->sess_id.data,
-                                        sessionid->data,
-                                        NFS4_MAX_SESSIONID_LEN) == 0) {
-                                /* Returns a held reference to clp */
-                                return clp;
-                        }
-                }
-                clp = nfs_find_client_next(prev);
-                nfs_put_client(prev);
-        } while (clp != NULL);
-        return NULL;
-}
-/*
 * For each referring call triple, check the session's slot table for
 * a match.  If the slot is in use and the sequence numbers match, the
 * client is still waiting for a response to the original request.
@@ -276,20 +368,28 @@ out:
 }
 __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
-                                struct cb_sequenceres *res)
+                              struct cb_sequenceres *res,
+                              struct cb_process_state *cps)
 {
        struct nfs_client *clp;
        int i;
-        __be32 status;
+        __be32 status = htonl(NFS4ERR_BADSESSION);
-        status = htonl(NFS4ERR_BADSESSION);
+        cps->clp = NULL;
-        clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
+        clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
        if (clp == NULL)
                goto out;
+        /* state manager is resetting the session */
+        if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+                status = NFS4ERR_DELAY;
+                goto out;
+        }
        status = validate_seqid(&clp->cl_session->bc_slot_table, args);
        if (status)
-                goto out_putclient;
+                goto out;
        /*
         * Check for pending referring calls.  If a match is found, a
@@ -298,7 +398,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
         */
        if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
                status = htonl(NFS4ERR_DELAY);
-                goto out_putclient;
+                goto out;
        }
        memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +407,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        res->csr_slotid = args->csa_slotid;
        res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
        res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+        nfs4_cb_take_slot(clp);
-out_putclient:
-        nfs_put_client(clp);
 out:
+        cps->clp = clp; /* put in nfs4_callback_compound */
        for (i = 0; i < args->csa_nrclists; i++)
                kfree(args->csa_rclists[i].rcl_refcalls);
        kfree(args->csa_rclists);
-        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
+        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
-                res->csr_status = 0;
+                cps->drc_status = status;
-        else
+                status = 0;
+        } else
                res->csr_status = status;
        dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
                ntohl(status), ntohl(res->csr_status));
        return status;
 }
-__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+        return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+                               struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        __be32 status;
        fmode_t flags = 0;
-        status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+        status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->craa_addr, 4);
+        if (!cps->clp) /* set in cb_sequence */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: RECALL_ANY callback request from %s\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+        status = cpu_to_be32(NFS4ERR_INVAL);
+        if (!validate_bitmap_values(args->craa_type_mask))
+                goto out;
+        status = cpu_to_be32(NFS4_OK);
        if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
                     &args->craa_type_mask))
                flags = FMODE_READ;
        if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
                     &args->craa_type_mask))
                flags |= FMODE_WRITE;
+        if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+                     &args->craa_type_mask))
+                pnfs_recall_all_layouts(cps->clp);
        if (flags)
-                nfs_expire_all_delegation_types(clp, flags);
+                nfs_expire_all_delegation_types(cps->clp, flags);
-        status = htonl(NFS4_OK);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
 }
 /* Reduce the fore channel's max_slots to the target value */
-__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+                                struct cb_process_state *cps)
 {
-        struct nfs_client *clp;
        struct nfs4_slot_table *fc_tbl;
        __be32 status;
        status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-        clp = nfs_find_client(args->crsa_addr, 4);
+        if (!cps->clp) /* set in cb_sequence */
-        if (clp == NULL)
                goto out;
        dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
-                rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
                args->crsa_target_max_slots);
-        fc_tbl = &clp->cl_session->fc_slot_table;
+        fc_tbl = &cps->clp->cl_session->fc_slot_table;
        status = htonl(NFS4ERR_BAD_HIGH_SLOT);
        if (args->crsa_target_max_slots > fc_tbl->max_slots ||
            args->crsa_target_max_slots < 1)
-                goto out_putclient;
+                goto out;
        status = htonl(NFS4_OK);
        if (args->crsa_target_max_slots == fc_tbl->max_slots)
-                goto out_putclient;
+                goto out;
        fc_tbl->target_max_slots = args->crsa_target_max_slots;
-        nfs41_handle_recall_slot(clp);
+        nfs41_handle_recall_slot(cps->clp);
-out_putclient:
-        nfs_put_client(clp);    /* balance nfs_find_client */
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0ed..14e0f9371d1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include "nfs4_fs.h"
 #include "callback.h"
+#include "internal.h"
 #define CB_OP_TAGLEN_MAXSZ      (512)
 #define CB_OP_HDR_RES_MAXSZ     (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
 #define CB_OP_RECALL_RES_MAXSZ  (CB_OP_HDR_RES_MAXSZ)
 #if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ        (CB_OP_HDR_RES_MAXSZ + \
                                        4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ       (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
 /* Internal error code */
 #define NFS4ERR_RESOURCE_HDR    11050
-typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *,
+                                        struct cb_process_state *);
 typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
 typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        hdr->minorversion = ntohl(*p++);
        /* Check minor version is zero or one. */
        if (hdr->minorversion <= 1) {
-                p++;    /* skip callback_ident */
+                hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
        } else {
                printk(KERN_WARNING "%s: NFSv4 server callback with "
                        "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
+                                       struct cb_layoutrecallargs *args)
+{
+        __be32 *p;
+        __be32 status = 0;
+        uint32_t iomode;
+        args->cbl_addr = svc_addr(rqstp);
+        p = read_buf(xdr, 4 * sizeof(uint32_t));
+        if (unlikely(p == NULL)) {
+                status = htonl(NFS4ERR_BADXDR);
+                goto out;
+        }
+        args->cbl_layout_type = ntohl(*p++);
+        /* Despite the spec's xdr, iomode really belongs in the FILE switch,
+         * as it is unusable and ignored with the other types.
+         */
+        iomode = ntohl(*p++);
+        args->cbl_layoutchanged = ntohl(*p++);
+        args->cbl_recall_type = ntohl(*p++);
+        if (args->cbl_recall_type == RETURN_FILE) {
+                args->cbl_range.iomode = iomode;
+                status = decode_fh(xdr, &args->cbl_fh);
+                if (unlikely(status != 0))
+                        goto out;
+                p = read_buf(xdr, 2 * sizeof(uint64_t));
+                if (unlikely(p == NULL)) {
+                        status = htonl(NFS4ERR_BADXDR);
+                        goto out;
+                }
+                p = xdr_decode_hyper(p, &args->cbl_range.offset);
+                p = xdr_decode_hyper(p, &args->cbl_range.length);
+                status = decode_stateid(xdr, &args->cbl_stateid);
+                if (unlikely(status != 0))
+                        goto out;
+        } else if (args->cbl_recall_type == RETURN_FSID) {
+                p = read_buf(xdr, 2 * sizeof(uint64_t));
+                if (unlikely(p == NULL)) {
+                        status = htonl(NFS4ERR_BADXDR);
+                        goto out;
+                }
+                p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+                p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+        } else if (args->cbl_recall_type != RETURN_ALL) {
+                status = htonl(NFS4ERR_BADXDR);
+                goto out;
+        }
+        dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+                __func__,
+                args->cbl_layout_type, iomode,
+                args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+        return status;
+}
 static __be32 decode_sessionid(struct xdr_stream *xdr,
                                 struct nfs4_sessionid *sid)
 {
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_SEQUENCE:
        case OP_CB_RECALL_ANY:
        case OP_CB_RECALL_SLOT:
+        case OP_CB_LAYOUTRECALL:
                *op = &callback_ops[op_nr];
                break;
-        case OP_CB_LAYOUTRECALL:
        case OP_CB_NOTIFY_DEVICEID:
        case OP_CB_NOTIFY:
        case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        return htonl(NFS_OK);
 }
+static void nfs4_callback_free_slot(struct nfs4_session *session)
+{
+        struct nfs4_slot_table *tbl = &session->bc_slot_table;
+        spin_lock(&tbl->slot_tbl_lock);
+        /*
+         * Let the state manager know callback processing is done.
+         * A single slot, so highest used slotid is either 0 or -1
+         */
+        tbl->highest_used_slotid--;
+        nfs4_check_drain_bc_complete(session);
+        spin_unlock(&tbl->slot_tbl_lock);
+}
+static void nfs4_cb_free_slot(struct nfs_client *clp)
+{
+        if (clp && clp->cl_session)
+                nfs4_callback_free_slot(clp->cl_session);
+}
+/* A single slot, so highest used slotid is either 0 or -1 */
+void nfs4_cb_take_slot(struct nfs_client *clp)
+{
+        struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
+        spin_lock(&tbl->slot_tbl_lock);
+        tbl->highest_used_slotid++;
+        BUG_ON(tbl->highest_used_slotid != 0);
+        spin_unlock(&tbl->slot_tbl_lock);
+}
 #else /* CONFIG_NFS_V4_1 */
 static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }
+static void nfs4_cb_free_slot(struct nfs_client *clp)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 static __be32 process_op(uint32_t minorversion, int nop,
                struct svc_rqst *rqstp,
                struct xdr_stream *xdr_in, void *argp,
-                struct xdr_stream *xdr_out, void *resp, int* drc_status)
+                struct xdr_stream *xdr_out, void *resp,
+                struct cb_process_state *cps)
 {
        struct callback_op *op = &callback_ops[0];
        unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
        if (status)
                goto encode_hdr;
-        if (*drc_status) {
+        if (cps->drc_status) {
-                status = *drc_status;
+                status = cps->drc_status;
                goto encode_hdr;
        }
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
        if (maxlen > 0 && maxlen < PAGE_SIZE) {
                status = op->decode_args(rqstp, xdr_in, argp);
                if (likely(status == 0))
-                        status = op->process_op(argp, resp);
+                        status = op->process_op(argp, resp, cps);
        } else
                status = htonl(NFS4ERR_RESOURCE);
-        /* Only set by OP_CB_SEQUENCE processing */
-        if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
-                *drc_status = status;
-                status = 0;
-        }
 encode_hdr:
        res = encode_op_hdr(xdr_out, op_nr, status);
        if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        struct cb_compound_hdr_arg hdr_arg = { 0 };
        struct cb_compound_hdr_res hdr_res = { NULL };
        struct xdr_stream xdr_in, xdr_out;
-        __be32 *p;
+        __be32 *p, status;
-        __be32 status, drc_status = 0;
+        struct cb_process_state cps = {
+                .drc_status = 0,
+                .clp = NULL,
+        };
        unsigned int nops = 0;
        dprintk("%s: start\n", __func__);
@@ -696,6 +792,12 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        if (status == __constant_htonl(NFS4ERR_RESOURCE))
                return rpc_garbage_args;
+        if (hdr_arg.minorversion == 0) {
+                cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
+                if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
+                        return rpc_drop_reply;
+        }
        hdr_res.taglen = hdr_arg.taglen;
        hdr_res.tag = hdr_arg.tag;
        if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +805,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        while (status == 0 && nops != hdr_arg.nops) {
                status = process_op(hdr_arg.minorversion, nops, rqstp,
-                                    &xdr_in, argp, &xdr_out, resp, &drc_status);
+                                    &xdr_in, argp, &xdr_out, resp, &cps);
                nops++;
        }
@@ -716,6 +818,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        *hdr_res.status = status;
        *hdr_res.nops = htonl(nops);
+        nfs4_cb_free_slot(cps.clp);
+        nfs_put_client(cps.clp);
        dprintk("%s: done, status = %u\n", __func__, ntohl(status));
        return rpc_success;
 }
@@ -739,6 +843,12 @@ static struct callback_op callback_ops[] = {
                .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
        },
 #if defined(CONFIG_NFS_V4_1)
+        [OP_CB_LAYOUTRECALL] = {
+                .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+                .decode_args =
+                        (callback_decode_arg_t)decode_layoutrecall_args,
+                .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+        },
        [OP_CB_SEQUENCE] = {
                .process_op = (callback_process_op_t)nfs4_callback_sequence,
                .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc..bd3ca32879e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
 static LIST_HEAD(nfs_client_list);
 static LIST_HEAD(nfs_volume_list);
 static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+#ifdef CONFIG_NFS_V4
+static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to look up the nfs_client struct
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+        int ret = 0;
+        if (clp->rpc_ops->version != 4 || minorversion != 0)
+                return ret;
+retry:
+        if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
+                return -ENOMEM;
+        spin_lock(&nfs_client_lock);
+        ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
+        spin_unlock(&nfs_client_lock);
+        if (ret == -EAGAIN)
+                goto retry;
+        return ret;
+}
+#endif /* CONFIG_NFS_V4 */
 /*
 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        clp->cl_proto = cl_init->proto;
 #ifdef CONFIG_NFS_V4
-        INIT_LIST_HEAD(&clp->cl_delegations);
+        err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+        if (err)
+                goto error_cleanup;
        spin_lock_init(&clp->cl_lock);
        INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
        rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
 }
 #ifdef CONFIG_NFS_V4
-/*
- * Clears/puts all minor version specific parts from an nfs_client struct
- * reverting it to minorversion 0.
- */
-static void nfs4_clear_client_minor_version(struct nfs_client *clp)
-{
 #ifdef CONFIG_NFS_V4_1
-        if (nfs4_has_session(clp)) {
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+        if (nfs4_has_session(clp))
                nfs4_destroy_session(clp->cl_session);
-                clp->cl_session = NULL;
-        }
-        clp->cl_mvops = nfs_v4_minor_ops[0];
-#endif /* CONFIG_NFS_V4_1 */
 }
+#else /* CONFIG_NFS_V4_1 */
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 {
        if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
                nfs4_kill_renewd(clp);
-        nfs4_clear_client_minor_version(clp);
+        nfs4_shutdown_session(clp);
        nfs4_destroy_callback(clp);
        if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
                nfs_idmap_delete(clp);
        rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
 }
+/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
+void nfs_cleanup_cb_ident_idr(void)
+{
+        idr_destroy(&cb_ident_idr);
+}
+/* nfs_client_lock held */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+        if (clp->cl_cb_ident)
+                idr_remove(&cb_ident_idr, clp->cl_cb_ident);
+}
+static void pnfs_init_server(struct nfs_server *server)
+{
+        rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
 #else
 static void nfs4_shutdown_client(struct nfs_client *clp)
 {
 }
+void nfs_cleanup_cb_ident_idr(void)
+{
+}
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
 #endif /* CONFIG_NFS_V4 */
 /*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
        if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
                list_del(&clp->cl_share_link);
+                nfs_cb_idr_remove_locked(clp);
                spin_unlock(&nfs_client_lock);
                BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
        return 0;
 }
-/*
+/* Common match routine for v4.0 and v4.1 callback services */
- * Find a client by IP address and protocol version
+bool
- * - returns NULL if no such client
+nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
- */
+                     u32 minorversion)
-struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 {
-        struct nfs_client *clp;
+        struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
-        spin_lock(&nfs_client_lock);
+        /* Don't match clients that failed to initialise */
-        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+        if (!(clp->cl_cons_state == NFS_CS_READY ||
-                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+            clp->cl_cons_state == NFS_CS_SESSION_INITING))
+                return false;
-                /* Don't match clients that failed to initialise properly */
+        /* Match the version and minorversion */
-                if (!(clp->cl_cons_state == NFS_CS_READY ||
+        if (clp->rpc_ops->version != 4 ||
-                      clp->cl_cons_state == NFS_CS_SESSION_INITING))
+            clp->cl_minorversion != minorversion)
-                        continue;
+                return false;
-                /* Different NFS versions cannot share the same nfs_client */
+        /* Match only the IP address, not the port number */
-                if (clp->rpc_ops->version != nfsversion)
+        if (!nfs_sockaddr_match_ipaddr(addr, clap))
-                        continue;
+                return false;
-                /* Match only the IP address, not the port number */
+        return true;
-                if (!nfs_sockaddr_match_ipaddr(addr, clap))
-                        continue;
-                atomic_inc(&clp->cl_count);
-                spin_unlock(&nfs_client_lock);
-                return clp;
-        }
-        spin_unlock(&nfs_client_lock);
-        return NULL;
-}
-/*
- * Find a client by IP address and protocol version
- * - returns NULL if no such client
- */
-struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
-{
-        struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
-        u32 nfsvers = clp->rpc_ops->version;
-        spin_lock(&nfs_client_lock);
-        list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
-                struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
-                /* Don't match clients that failed to initialise properly */
-                if (clp->cl_cons_state != NFS_CS_READY)
-                        continue;
-                /* Different NFS versions cannot share the same nfs_client */
-                if (clp->rpc_ops->version != nfsvers)
-                        continue;
-                /* Match only the IP address, not the port number */
-                if (!nfs_sockaddr_match_ipaddr(sap, clap))
-                        continue;
-                atomic_inc(&clp->cl_count);
-                spin_unlock(&nfs_client_lock);
-                return clp;
-        }
-        spin_unlock(&nfs_client_lock);
-        return NULL;
 }
 /*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
        target->options = source->options;
 }
+static void nfs_server_insert_lists(struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        spin_lock(&nfs_client_lock);
+        list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+        list_add_tail(&server->master_link, &nfs_volume_list);
+        spin_unlock(&nfs_client_lock);
+}
+static void nfs_server_remove_lists(struct nfs_server *server)
+{
+        spin_lock(&nfs_client_lock);
+        list_del_rcu(&server->client_link);
+        list_del(&server->master_link);
+        spin_unlock(&nfs_client_lock);
+        synchronize_rcu();
+}
 /*
 * Allocate and initialise a server record
 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
        /* Zero out the NFS state stuff */
        INIT_LIST_HEAD(&server->client_link);
        INIT_LIST_HEAD(&server->master_link);
+        INIT_LIST_HEAD(&server->delegations);
        atomic_set(&server->active, 0);
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
                return NULL;
        }
+        pnfs_init_server(server);
        return server;
 }
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
 {
        dprintk("--> nfs_free_server()\n");
+        nfs_server_remove_lists(server);
        unset_pnfs_layoutdriver(server);
-        spin_lock(&nfs_client_lock);
-        list_del(&server->client_link);
-        list_del(&server->master_link);
-        spin_unlock(&nfs_client_lock);
        if (server->destroy != NULL)
                server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
                (unsigned long long) server->fsid.major,
                (unsigned long long) server->fsid.minor);
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
        nfs_free_fattr(fattr);
        return server;
@@ -1125,6 +1156,96 @@ error:
 #ifdef CONFIG_NFS_V4
 /*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by IP address, protocol version, and minorversion
+ *
+ * Called from the pg_authenticate method. The callback identifier
+ * is not used as it has not been decoded.
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_no_ident(const struct sockaddr *addr)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                if (nfs4_cb_match_client(addr, clp, 0) == false)
+                        continue;
+                atomic_inc(&clp->cl_count);
+                spin_unlock(&nfs_client_lock);
+                return clp;
+        }
+        spin_unlock(&nfs_client_lock);
+        return NULL;
+}
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by callback identifier
+ */
+struct nfs_client *
+nfs4_find_client_ident(int cb_ident)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        clp = idr_find(&cb_ident_idr, cb_ident);
+        if (clp)
+                atomic_inc(&clp->cl_count);
+        spin_unlock(&nfs_client_lock);
+        return clp;
+}
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+                           struct nfs4_sessionid *sid)
+{
+        struct nfs_client *clp;
+        spin_lock(&nfs_client_lock);
+        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+                if (nfs4_cb_match_client(addr, clp, 1) == false)
+                        continue;
+                if (!nfs4_has_session(clp))
+                        continue;
+                /* Match sessionid */
+                if (memcmp(clp->cl_session->sess_id.data,
+                    sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
+                        continue;
+                atomic_inc(&clp->cl_count);
+                spin_unlock(&nfs_client_lock);
+                return clp;
+        }
+        spin_unlock(&nfs_client_lock);
+        return NULL;
+}
+#else /* CONFIG_NFS_V4_1 */
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+                           struct nfs4_sessionid *sid)
+{
+        return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
 * Initialize the NFS4 callback service
 */
 static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1463,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
        if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
                server->namelen = NFS4_MAXNAMLEN;
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
 out:
        nfs_free_fattr(fattr);
@@ -1551,11 +1668,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
        if (error < 0)
                goto out_free_server;
-        spin_lock(&nfs_client_lock);
+        nfs_server_insert_lists(server);
-        list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-        list_add_tail(&server->master_link, &nfs_volume_list);
-        spin_unlock(&nfs_client_lock);
        server->mount_time = jiffies;
        nfs_free_fattr(fattr_fsinfo);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33..bbbc6bf5cb2 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/nfs4.h>
@@ -24,8 +23,6 @@
 static void nfs_do_free_delegation(struct nfs_delegation *delegation)
 {
-        if (delegation->cred)
-                put_rpccred(delegation->cred);
        kfree(delegation);
 }
@@ -38,14 +35,30 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
+        if (delegation->cred) {
+                put_rpccred(delegation->cred);
+                delegation->cred = NULL;
+        }
        call_rcu(&delegation->rcu, nfs_free_delegation_callback);
 }
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ */
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 {
        set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
+/**
+ * nfs_have_delegation - check if inode has a delegation
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
 int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
        struct nfs_delegation *delegation;
@@ -120,10 +133,15 @@ again:
        return 0;
 }
-/*
+/**
- * Set up a delegation on an inode
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @res: new delegation state from server
+ *
 */
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+                                  struct nfs_openres *res)
 {
        struct nfs_delegation *delegation;
        struct rpc_cred *oldcred = NULL;
@@ -176,38 +194,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
        return inode;
 }
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+static struct nfs_delegation *
-                                                           const nfs4_stateid *stateid,
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
-                                                           struct nfs_client *clp)
+                             struct nfs_server *server)
 {
        struct nfs_delegation *delegation =
                rcu_dereference_protected(nfsi->delegation,
-                                          lockdep_is_held(&clp->cl_lock));
+                                lockdep_is_held(&server->nfs_client->cl_lock));
        if (delegation == NULL)
                goto nomatch;
        spin_lock(&delegation->lock);
-        if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
-                                sizeof(delegation->stateid.data)) != 0)
-                goto nomatch_unlock;
        list_del_rcu(&delegation->super_list);
        delegation->inode = NULL;
        nfsi->delegation_state = 0;
        rcu_assign_pointer(nfsi->delegation, NULL);
        spin_unlock(&delegation->lock);
        return delegation;
-nomatch_unlock:
-        spin_unlock(&delegation->lock);
 nomatch:
        return NULL;
 }
-/*
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
- * Set up a delegation on an inode
+                                                    struct nfs_server *server)
+{
+        struct nfs_client *clp = server->nfs_client;
+        struct nfs_delegation *delegation;
+        spin_lock(&clp->cl_lock);
+        delegation = nfs_detach_delegation_locked(nfsi, server);
+        spin_unlock(&clp->cl_lock);
+        return delegation;
+}
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @res: new delegation state from server
+ *
+ * Returns zero on success, or a negative errno value.
 */
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
+        struct nfs_client *clp = server->nfs_client;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation, *old_delegation;
        struct nfs_delegation *freeme = NULL;
@@ -228,7 +260,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        spin_lock(&clp->cl_lock);
        old_delegation = rcu_dereference_protected(nfsi->delegation,
-                                                   lockdep_is_held(&clp->cl_lock));
+                                        lockdep_is_held(&clp->cl_lock));
        if (old_delegation != NULL) {
                if (memcmp(&delegation->stateid, &old_delegation->stateid,
                                        sizeof(old_delegation->stateid)) == 0 &&
@@ -247,9 +279,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
                        delegation = NULL;
                        goto out;
                }
-                freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
+                freeme = nfs_detach_delegation_locked(nfsi, server);
        }
-        list_add_rcu(&delegation->super_list, &clp->cl_delegations);
+        list_add_rcu(&delegation->super_list, &server->delegations);
        nfsi->delegation_state = delegation->type;
        rcu_assign_pointer(nfsi->delegation, delegation);
        delegation = NULL;
@@ -291,73 +323,85 @@ out:
        return err;
 }
-/*
+/**
- * Return all delegations that have been marked for return
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Returns zero on success, or a negative errno value.
 */
 int nfs_client_return_marked_delegations(struct nfs_client *clp)
 {
        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        struct inode *inode;
        int err = 0;
 restart:
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+                list_for_each_entry_rcu(delegation, &server->delegations,
-                        continue;
+                                                                super_list) {
-                inode = nfs_delegation_grab_inode(delegation);
+                        if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
-                if (inode == NULL)
+                                                        &delegation->flags))
-                        continue;
+                                continue;
-                spin_lock(&clp->cl_lock);
+                        inode = nfs_delegation_grab_inode(delegation);
-                delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
+                        if (inode == NULL)
-                spin_unlock(&clp->cl_lock);
+                                continue;
-                rcu_read_unlock();
+                        delegation = nfs_detach_delegation(NFS_I(inode),
-                if (delegation != NULL) {
+                                                                server);
-                        filemap_flush(inode->i_mapping);
+                        rcu_read_unlock();
-                        err = __nfs_inode_return_delegation(inode, delegation, 0);
+                        if (delegation != NULL) {
+                                filemap_flush(inode->i_mapping);
+                                err = __nfs_inode_return_delegation(inode,
+                                                                delegation, 0);
+                        }
+                        iput(inode);
+                        if (!err)
+                                goto restart;
+                        set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+                        return err;
                }
-                iput(inode);
-                if (!err)
-                        goto restart;
-                set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
-                return err;
        }
        rcu_read_unlock();
        return 0;
 }
-/*
+/**
- * This function returns the delegation without reclaiming opens
+ * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
- * or protecting against delegation reclaims.
+ * @inode: inode to process
- * It is therefore really only safe to be called from
+ *
- * nfs4_clear_inode()
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode().
 */
 void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
        if (rcu_access_pointer(nfsi->delegation) != NULL) {
-                spin_lock(&clp->cl_lock);
+                delegation = nfs_detach_delegation(nfsi, server);
-                delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
-                spin_unlock(&clp->cl_lock);
                if (delegation != NULL)
                        nfs_do_return_delegation(inode, delegation, 0);
        }
 }
+/**
+ * nfs_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * Returns zero on success, or a negative errno value.
+ */
 int nfs_inode_return_delegation(struct inode *inode)
 {
-        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
        int err = 0;
        if (rcu_access_pointer(nfsi->delegation) != NULL) {
-                spin_lock(&clp->cl_lock);
+                delegation = nfs_detach_delegation(nfsi, server);
-                delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
-                spin_unlock(&clp->cl_lock);
                if (delegation != NULL) {
                        nfs_wb_all(inode);
                        err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -366,46 +410,61 @@ int nfs_inode_return_delegation(struct inode *inode)
        return err;
 }
-static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
+static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
 {
+        struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
        set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
        set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 }
-/*
+/**
- * Return all delegations associated to a super block
+ * nfs_super_return_all_delegations - return delegations for one superblock
+ * @sb: sb to process
+ *
 */
 void nfs_super_return_all_delegations(struct super_block *sb)
 {
-        struct nfs_client *clp = NFS_SB(sb)->nfs_client;
+        struct nfs_server *server = NFS_SB(sb);
+        struct nfs_client *clp = server->nfs_client;
        struct nfs_delegation *delegation;
        if (clp == NULL)
                return;
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
                spin_lock(&delegation->lock);
-                if (delegation->inode != NULL && delegation->inode->i_sb == sb)
+                set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-                        set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
                spin_unlock(&delegation->lock);
        }
        rcu_read_unlock();
        if (nfs_client_return_marked_delegations(clp) != 0)
                nfs4_schedule_state_manager(clp);
 }
-static
+static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
-void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags)
+                                                 fmode_t flags)
 {
        struct nfs_delegation *delegation;
-        rcu_read_lock();
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
                if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
                        continue;
                if (delegation->type & flags)
-                        nfs_mark_return_delegation(clp, delegation);
+                        nfs_mark_return_delegation(delegation);
        }
+}
+static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
+                                                        fmode_t flags)
+{
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs_mark_return_all_delegation_types(server, flags);
        rcu_read_unlock();
 }
@@ -420,19 +479,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
                nfs4_schedule_state_manager(clp);
 }
+/**
+ * nfs_expire_all_delegation_types
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ */
 void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
 {
        nfs_client_mark_return_all_delegation_types(clp, flags);
        nfs_delegation_run_state_manager(clp);
 }
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ */
 void nfs_expire_all_delegations(struct nfs_client *clp)
 {
        nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
 }
-/*
+/**
- * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
+ * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
 */
 void nfs_handle_cb_pathdown(struct nfs_client *clp)
 {
@@ -441,29 +513,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
        nfs_client_mark_return_all_delegations(clp);
 }
-static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 {
        struct nfs_delegation *delegation;
-        rcu_read_lock();
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
                if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
                        continue;
-                nfs_mark_return_delegation(clp, delegation);
+                nfs_mark_return_delegation(delegation);
        }
-        rcu_read_unlock();
 }
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ */
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 {
-        nfs_client_mark_return_unreferenced_delegations(clp);
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs_mark_return_unreferenced_delegations(server);
+        rcu_read_unlock();
        nfs_delegation_run_state_manager(clp);
 }
-/*
+/**
- * Asynchronous delegation recall!
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information from CB_RECALL arguments
+ *
+ * Returns zero on success, or a negative errno value.
 */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
+int nfs_async_inode_return_delegation(struct inode *inode,
+                                      const nfs4_stateid *stateid)
 {
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_delegation *delegation;
@@ -475,22 +561,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
                rcu_read_unlock();
                return -ENOENT;
        }
+        nfs_mark_return_delegation(delegation);
-        nfs_mark_return_delegation(clp, delegation);
        rcu_read_unlock();
        nfs_delegation_run_state_manager(clp);
        return 0;
 }
-/*
+static struct inode *
- * Retrieve the inode associated with a delegation
+nfs_delegation_find_inode_server(struct nfs_server *server,
- */
+                                 const struct nfs_fh *fhandle)
-struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
 {
        struct nfs_delegation *delegation;
        struct inode *res = NULL;
-        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
                spin_lock(&delegation->lock);
                if (delegation->inode != NULL &&
                    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -500,49 +585,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
                if (res != NULL)
                        break;
        }
+        return res;
+}
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Returns pointer to inode matching "fhandle," or NULL if a matching inode
+ * cannot be found.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+                                        const struct nfs_fh *fhandle)
+{
+        struct nfs_server *server;
+        struct inode *res = NULL;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+                res = nfs_delegation_find_inode_server(server, fhandle);
+                if (res != NULL)
+                        break;
+        }
        rcu_read_unlock();
        return res;
 }
-/*
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
- * Mark all delegations as needing to be reclaimed
+{
+        struct nfs_delegation *delegation;
+        list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+                set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
 */
 void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 {
-        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
-                set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+                nfs_delegation_mark_reclaim_server(server);
        rcu_read_unlock();
 }
-/*
+/**
- * Reap all unclaimed delegations after reboot recovery is done
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
 */
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
        struct nfs_delegation *delegation;
+        struct nfs_server *server;
        struct inode *inode;
 restart:
        rcu_read_lock();
-        list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
+                list_for_each_entry_rcu(delegation, &server->delegations,
-                        continue;
+                                                                super_list) {
-                inode = nfs_delegation_grab_inode(delegation);
+                        if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
-                if (inode == NULL)
+                                                &delegation->flags) == 0)
-                        continue;
+                                continue;
-                spin_lock(&clp->cl_lock);
+                        inode = nfs_delegation_grab_inode(delegation);
-                delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
+                        if (inode == NULL)
-                spin_unlock(&clp->cl_lock);
+                                continue;
-                rcu_read_unlock();
+                        delegation = nfs_detach_delegation(NFS_I(inode),
-                if (delegation != NULL)
+                                                                server);
-                        nfs_free_delegation(delegation);
+                        rcu_read_unlock();
-                iput(inode);
-                goto restart;
+                        if (delegation != NULL)
+                                nfs_free_delegation(delegation);
+                        iput(inode);
+                        goto restart;
+                }
        }
        rcu_read_unlock();
 }
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client, otherwise zero is returned.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+        struct nfs_server *server;
+        int ret = 0;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                if (!list_empty(&server->delegations)) {
+                        ret = 1;
+                        break;
+                }
+        rcu_read_unlock();
+        return ret;
+}
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ *
+ * Returns one and fills in "dst->data" if inode had a delegation,
+ * otherwise zero is returned.
+ */
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda1..d9322e490c5 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 void nfs_handle_cb_pathdown(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e56..2c3eb33b904 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,7 +33,8 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
-#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
+#include <linux/xattr.h>
 #include "delegation.h"
 #include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
                      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
 const struct file_operations nfs_dir_operations = {
        .llseek         = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
        .setattr        = nfs_setattr,
 };
-const struct address_space_operations nfs_dir_addr_space_ops = {
+const struct address_space_operations nfs_dir_aops = {
-        .releasepage = nfs_readdir_clear_array,
+        .freepage = nfs_readdir_clear_array,
 };
 #ifdef CONFIG_NFS_V3
@@ -124,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
-        .getxattr       = nfs4_getxattr,
+        .getxattr       = generic_getxattr,
-        .setxattr       = nfs4_setxattr,
+        .setxattr       = generic_setxattr,
-        .listxattr      = nfs4_listxattr,
+        .listxattr      = generic_listxattr,
+        .removexattr    = generic_removexattr,
 };
 #endif /* CONFIG_NFS_V4 */
@@ -161,6 +163,7 @@ struct nfs_cache_array_entry {
        u64 cookie;
        u64 ino;
        struct qstr string;
+        unsigned char d_type;
 };
 struct nfs_cache_array {
@@ -170,14 +173,13 @@ struct nfs_cache_array {
        struct nfs_cache_array_entry array[0];
 };
-#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
-typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
        struct file     *file;
        struct page     *page;
        unsigned long   page_index;
        u64             *dir_cookie;
+        u64             last_cookie;
        loff_t          current_index;
        decode_dirent_t decode;
@@ -194,9 +196,13 @@ typedef struct {
 static
 struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
 {
+        void *ptr;
        if (page == NULL)
                return ERR_PTR(-EIO);
-        return (struct nfs_cache_array *)kmap(page);
+        ptr = kmap(page);
+        if (ptr == NULL)
+                return ERR_PTR(-ENOMEM);
+        return ptr;
 }
 static
@@ -209,14 +215,15 @@ void nfs_readdir_release_array(struct page *page)
 * we are freeing strings created by nfs_add_to_readdir_array()
 */
 static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
 {
-        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        struct nfs_cache_array *array;
        int i;
+        array = kmap_atomic(page, KM_USER0);
        for (i = 0; i < array->size; i++)
                kfree(array->array[i].string.name);
-        nfs_readdir_release_array(page);
+        kunmap_atomic(array, KM_USER0);
-        return 0;
 }
 /*
@@ -231,6 +238,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
        string->name = kmemdup(name, len, GFP_KERNEL);
        if (string->name == NULL)
                return -ENOMEM;
+        /*
+         * Avoid a kmemleak false positive. The pointer to the name is stored
+         * in a page cache page which kmemleak does not scan.
+         */
+        kmemleak_not_leak(string->name);
        string->hash = full_name_hash(name, len);
        return 0;
 }
@@ -244,20 +256,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
        if (IS_ERR(array))
                return PTR_ERR(array);
-        ret = -EIO;
-        if (array->size >= MAX_READDIR_ARRAY)
-                goto out;
        cache_entry = &array->array[array->size];
+        /* Check that this entry lies within the page bounds */
+        ret = -ENOSPC;
+        if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+                goto out;
        cache_entry->cookie = entry->prev_cookie;
        cache_entry->ino = entry->ino;
+        cache_entry->d_type = entry->d_type;
        ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
        if (ret)
                goto out;
        array->last_cookie = entry->cookie;
-        if (entry->eof == 1)
-                array->eof_index = array->size;
        array->size++;
+        if (entry->eof != 0)
+                array->eof_index = array->size;
 out:
        nfs_readdir_release_array(page);
        return ret;
@@ -272,7 +288,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        if (diff < 0)
                goto out_eof;
        if (diff >= array->size) {
-                if (array->eof_index > 0)
+                if (array->eof_index >= 0)
                        goto out_eof;
                desc->current_index += array->size;
                return -EAGAIN;
@@ -281,8 +297,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        index = (unsigned int)diff;
        *desc->dir_cookie = array->array[index].cookie;
        desc->cache_entry_index = index;
-        if (index == array->eof_index)
-                desc->eof = 1;
        return 0;
 out_eof:
        desc->eof = 1;
@@ -296,17 +310,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
        int status = -EAGAIN;
        for (i = 0; i < array->size; i++) {
-                if (i == array->eof_index) {
-                        desc->eof = 1;
-                        status = -EBADCOOKIE;
-                }
                if (array->array[i].cookie == *desc->dir_cookie) {
                        desc->cache_entry_index = i;
-                        status = 0;
+                        return 0;
-                        break;
                }
        }
+        if (array->eof_index >= 0) {
+                status = -EBADCOOKIE;
+                if (*desc->dir_cookie == array->last_cookie)
+                        desc->eof = 1;
+        }
        return status;
 }
@@ -314,10 +327,7 @@ static
 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 {
        struct nfs_cache_array *array;
-        int status = -EBADCOOKIE;
+        int status;
-        if (desc->dir_cookie == NULL)
-                goto out;
        array = nfs_readdir_get_array(desc->page);
        if (IS_ERR(array)) {
@@ -330,6 +340,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
        else
                status = nfs_readdir_search_for_cookie(array, desc);
+        if (status == -EAGAIN) {
+                desc->last_cookie = array->last_cookie;
+                desc->page_index++;
+        }
        nfs_readdir_release_array(desc->page);
 out:
        return status;
@@ -365,14 +379,14 @@ error:
        return error;
 }
-/* Fill in an entry based on the xdr code stored in desc->page */
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
-static
+                      struct nfs_entry *entry, struct xdr_stream *xdr)
-int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-        __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
+        int error;
-        if (IS_ERR(p))
-                return PTR_ERR(p);
+        error = desc->decode(xdr, entry, desc->plus);
+        if (error)
+                return error;
        entry->fattr->time_start = desc->timestamp;
        entry->fattr->gencount = desc->gencount;
        return 0;
@@ -381,13 +395,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
 static
 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
-        struct nfs_inode *node;
        if (dentry->d_inode == NULL)
                goto different;
-        node = NFS_I(dentry->d_inode);
+        if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
-        if (node->fh.size != entry->fh->size)
-                goto different;
-        if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
                goto different;
        return 1;
 different:
@@ -429,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
        if (dentry == NULL)
                return;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
        if (IS_ERR(inode))
                goto out;
@@ -449,43 +458,58 @@ out:
 /* Perform conversion from xdr to cache array */
 static
-void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-                                void *xdr_page, struct page *page, unsigned int buflen)
+                                struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
        struct xdr_stream stream;
-        struct xdr_buf buf;
+        struct xdr_buf buf = {
-        __be32 *ptr = xdr_page;
+                .pages = xdr_pages,
-        int status;
+                .page_len = buflen,
+                .buflen = buflen,
+                .len = buflen,
+        };
+        struct page *scratch;
        struct nfs_cache_array *array;
+        unsigned int count = 0;
+        int status;
-        buf.head->iov_base = xdr_page;
+        scratch = alloc_page(GFP_KERNEL);
-        buf.head->iov_len = buflen;
+        if (scratch == NULL)
-        buf.tail->iov_len = 0;
+                return -ENOMEM;
-        buf.page_base = 0;
-        buf.page_len = 0;
-        buf.buflen = buf.head->iov_len;
-        buf.len = buf.head->iov_len;
-        xdr_init_decode(&stream, &buf, ptr);
+        xdr_init_decode(&stream, &buf, NULL);
+        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
        do {
                status = xdr_decode(desc, entry, &stream);
-                if (status != 0)
+                if (status != 0) {
+                        if (status == -EAGAIN)
+                                status = 0;
                        break;
+                }
-                if (nfs_readdir_add_to_array(entry, page) == -1)
+                count++;
-                        break;
-                if (desc->plus == 1)
+                if (desc->plus != 0)
                        nfs_prime_dcache(desc->file->f_path.dentry, entry);
+                status = nfs_readdir_add_to_array(entry, page);
+                if (status != 0)
+                        break;
        } while (!entry->eof);
-        if (status == -EBADCOOKIE && entry->eof) {
+        if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
                array = nfs_readdir_get_array(page);
-                array->eof_index = array->size - 1;
+                if (!IS_ERR(array)) {
-                status = 0;
+                        array->eof_index = array->size;
-                nfs_readdir_release_array(page);
+                        status = 0;
+                        nfs_readdir_release_array(page);
+                } else
+                        status = PTR_ERR(array);
        }
+        put_page(scratch);
+        return status;
 }
 static
@@ -500,7 +524,6 @@ static
 void nfs_readdir_free_large_page(void *ptr, struct page **pages,
                unsigned int npages)
 {
-        vm_unmap_ram(ptr, npages);
        nfs_readdir_free_pagearray(pages, npages);
 }
@@ -509,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
 * to nfs_readdir_free_large_page
 */
 static
-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-        void *ptr;
        unsigned int i;
        for (i = 0; i < npages; i++) {
@@ -520,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
                        goto out_freepages;
                pages[i] = page;
        }
+        return 0;
-        ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-        if (!IS_ERR_OR_NULL(ptr))
-                return ptr;
 out_freepages:
        nfs_readdir_free_pagearray(pages, i);
-        return NULL;
+        return -ENOMEM;
 }
 static
@@ -537,31 +557,43 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        struct nfs_entry entry;
        struct file     *file = desc->file;
        struct nfs_cache_array *array;
-        int status = 0;
+        int status = -ENOMEM;
        unsigned int array_size = ARRAY_SIZE(pages);
        entry.prev_cookie = 0;
-        entry.cookie = *desc->dir_cookie;
+        entry.cookie = desc->last_cookie;
        entry.eof = 0;
        entry.fh = nfs_alloc_fhandle();
        entry.fattr = nfs_alloc_fattr();
+        entry.server = NFS_SERVER(inode);
        if (entry.fh == NULL || entry.fattr == NULL)
                goto out;
        array = nfs_readdir_get_array(page);
+        if (IS_ERR(array)) {
+                status = PTR_ERR(array);
+                goto out;
+        }
        memset(array, 0, sizeof(struct nfs_cache_array));
        array->eof_index = -1;
-        pages_ptr = nfs_readdir_large_page(pages, array_size);
+        status = nfs_readdir_large_page(pages, array_size);
-        if (!pages_ptr)
+        if (status < 0)
                goto out_release_array;
        do {
+                unsigned int pglen;
                status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
                if (status < 0)
                        break;
-                nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
+                pglen = status;
-        } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+                status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+                if (status < 0) {
+                        if (status == -ENOSPC)
+                                status = 0;
+                        break;
+                }
+        } while (array->eof_index < 0);
        nfs_readdir_free_large_page(pages_ptr, pages, array_size);
 out_release_array:
@@ -582,8 +614,10 @@ static
 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
        struct inode    *inode = desc->file->f_path.dentry->d_inode;
+        int ret;
-        if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+        ret = nfs_readdir_xdr_to_array(desc, page, inode);
+        if (ret < 0)
                goto error;
        SetPageUptodate(page);
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
        return 0;
 error:
        unlock_page(page);
-        return -EIO;
+        return ret;
 }
 static
 void cache_page_release(nfs_readdir_descriptor_t *desc)
 {
+        if (!desc->page->mapping)
+                nfs_readdir_clear_array(desc->page);
        page_cache_release(desc->page);
        desc->page = NULL;
 }
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
 static
 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
 {
-        struct page *page;
+        return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
-        page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
                        desc->page_index, (filler_t *)nfs_readdir_filler, desc);
-        if (IS_ERR(page))
-                desc->eof = 1;
-        return page;
 }
 /*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
                return PTR_ERR(desc->page);
        res = nfs_readdir_search_array(desc);
-        if (res == 0)
+        if (res != 0)
-                return 0;
+                cache_page_release(desc);
-        cache_page_release(desc);
        return res;
 }
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-        int res = -EAGAIN;
+        int res;
-        while (1) {
+        if (desc->page_index == 0) {
-                res = find_cache_page(desc);
+                desc->current_index = 0;
-                if (res != -EAGAIN)
+                desc->last_cookie = 0;
-                        break;
-                desc->page_index++;
        }
+        do {
+                res = find_cache_page(desc);
+        } while (res == -EAGAIN);
        return res;
 }
-static inline unsigned int dt_type(struct inode *inode)
-{
-        return (inode->i_mode >> 12) & 15;
-}
 /*
 * Once we've found the start of the dirent within a page: fill 'er up...
 */
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
        int i = 0;
        int res = 0;
        struct nfs_cache_array *array = NULL;
-        unsigned int d_type = DT_UNKNOWN;
-        struct dentry *dentry = NULL;
        array = nfs_readdir_get_array(desc->page);
+        if (IS_ERR(array)) {
+                res = PTR_ERR(array);
+                goto out;
+        }
        for (i = desc->cache_entry_index; i < array->size; i++) {
-                d_type = DT_UNKNOWN;
+                struct nfs_cache_array_entry *ent;
-                res = filldir(dirent, array->array[i].string.name,
+                ent = &array->array[i];
-                        array->array[i].string.len, file->f_pos,
+                if (filldir(dirent, ent->string.name, ent->string.len,
-                        nfs_compat_user_ino64(array->array[i].ino), d_type);
+                    file->f_pos, nfs_compat_user_ino64(ent->ino),
-                if (res < 0)
+                    ent->d_type) < 0) {
+                        desc->eof = 1;
                        break;
+                }
                file->f_pos++;
-                desc->cache_entry_index = i;
                if (i < (array->size-1))
                        *desc->dir_cookie = array->array[i+1].cookie;
                else
                        *desc->dir_cookie = array->last_cookie;
-                if (i == array->eof_index) {
-                        desc->eof = 1;
-                        break;
-                }
        }
+        if (array->eof_index >= 0)
+                desc->eof = 1;
        nfs_readdir_release_array(desc->page);
+out:
        cache_page_release(desc);
-        if (dentry != NULL)
-                dput(dentry);
        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
                        (unsigned long long)*desc->dir_cookie, res);
        return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                goto out;
        }
-        if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
-                status = -EIO;
-                goto out_release;
-        }
        desc->page_index = 0;
+        desc->last_cookie = *desc->dir_cookie;
        desc->page = page;
+        status = nfs_readdir_xdr_to_array(desc, page, inode);
+        if (status < 0)
+                goto out_release;
        status = nfs_do_filldir(desc, dirent, filldir);
 out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode    *inode = dentry->d_inode;
        nfs_readdir_descriptor_t my_desc,
                        *desc = &my_desc;
-        int res = -ENOMEM;
+        int res;
        dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        if (res < 0)
                goto out;
-        while (desc->eof != 1) {
+        do {
                res = readdir_search_pagecache(desc);
                if (res == -EBADCOOKIE) {
+                        res = 0;
                        /* This means either end of directory */
                        if (*desc->dir_cookie && desc->eof == 0) {
                                /* Or that the server has 'lost' a cookie */
                                res = uncached_readdir(desc, dirent, filldir);
-                                if (res >= 0)
+                                if (res == 0)
                                        continue;
                        }
-                        res = 0;
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                res = nfs_do_filldir(desc, dirent, filldir);
-                if (res < 0) {
+                if (res < 0)
-                        res = 0;
                        break;
-                }
+        } while (!desc->eof);
-        }
 out:
        nfs_unblock_sillyrename(dentry);
        if (res > 0)
@@ -912,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 * component of the path.
 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
 */
-static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask)
+static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
+                                                unsigned int mask)
 {
        if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
                return 0;
@@ -943,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
 {
        struct nfs_server *server = NFS_SERVER(inode);
-        if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags))
+        if (IS_AUTOMOUNT(inode))
                return 0;
        if (nd != NULL) {
                /* VFS wants an on-the-wire revalidation */
@@ -992,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
 * If the parent directory is seen to have changed, we throw out the
 * cached dentry and do a new lookup.
 */
-static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
+static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir;
        struct inode *inode;
@@ -1001,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        struct nfs_fattr *fattr = NULL;
        int error;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -1091,7 +1121,7 @@ out_error:
 /*
 * This is called from dput() when d_count is going to 0.
 */
-static int nfs_dentry_delete(struct dentry *dentry)
+static int nfs_dentry_delete(const struct dentry *dentry)
 {
        dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -1143,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
        .d_revalidate   = nfs_lookup_revalidate,
        .d_delete       = nfs_dentry_delete,
        .d_iput         = nfs_dentry_iput,
+        .d_automount    = nfs_d_automount,
 };
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1162,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
        if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
                goto out;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
        /*
         * If we're doing an exclusive create, optimize away the lookup
         * but don't hash the dentry.
@@ -1191,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
                goto out_unblock_sillyrename;
        }
        inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
-        res = (struct dentry *)inode;
+        res = ERR_CAST(inode);
        if (IS_ERR(res))
                goto out_unblock_sillyrename;
@@ -1218,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
        .d_revalidate   = nfs_open_revalidate,
        .d_delete       = nfs_dentry_delete,
        .d_iput         = nfs_dentry_iput,
+        .d_automount    = nfs_d_automount,
 };
 /*
@@ -1307,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                res = ERR_PTR(-ENAMETOOLONG);
                goto out;
        }
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
        /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
         * the dentry. */
@@ -1325,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
        if (nd->flags & LOOKUP_CREATE) {
                attr.ia_mode = nd->intent.open.create_mode;
                attr.ia_valid = ATTR_MODE;
-                if (!IS_POSIXACL(dir))
+                attr.ia_mode &= ~current_umask();
-                        attr.ia_mode &= ~current_umask();
        } else {
                open_flags &= ~(O_EXCL | O_CREAT);
                attr.ia_valid = 0;
@@ -1345,12 +1373,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
-                        case -EISDIR:
                        case -ENOTDIR:
                                goto no_open;
                        case -ELOOP:
                                if (!(nd->intent.open.flags & O_NOFOLLOW))
                                        goto no_open;
+                        /* case -EISDIR: */
                        /* case -EINVAL: */
                        default:
                                res = ERR_CAST(inode);
@@ -1380,11 +1408,15 @@ no_open:
 static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct dentry *parent = NULL;
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        struct inode *dir;
        struct nfs_open_context *ctx;
        int openflags, ret = 0;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
        if (!is_atomic_open(nd) || d_mountpoint(dentry))
                goto no_open;
@@ -1553,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
        struct iattr attr;
        int error;
+        int open_flags = 0;
        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1560,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
        attr.ia_mode = mode;
        attr.ia_valid = ATTR_MODE;
-        error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+        if ((nd->flags & LOOKUP_CREATE) != 0)
+                open_flags = nd->intent.open.flags;
+        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
        if (error != 0)
                goto out_err;
        return 0;
@@ -1692,11 +1728,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
                dir->i_ino, dentry->d_name.name);
-        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
-        if (atomic_read(&dentry->d_count) > 1) {
+        if (dentry->d_count > 1) {
                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
                /* Start asynchronous writeout of the inode */
                write_inode_now(dentry->d_inode, 0);
                error = nfs_sillyrename(dir, dentry);
@@ -1707,7 +1741,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
                need_rehash = 1;
        }
        spin_unlock(&dentry->d_lock);
-        spin_unlock(&dcache_lock);
        error = nfs_safe_remove(dentry);
        if (!error || error == -ENOENT) {
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -1842,7 +1875,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
                 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-                 atomic_read(&new_dentry->d_count));
+                 new_dentry->d_count);
        /*
         * For non-directories, check whether the target is busy and if so,
@@ -1860,7 +1893,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        rehash = new_dentry;
                }
-                if (atomic_read(&new_dentry->d_count) > 2) {
+                if (new_dentry->d_count > 2) {
                        int err;
                        /* copy the target dentry's name */
@@ -2162,11 +2195,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
        return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
-int nfs_permission(struct inode *inode, int mask)
+int nfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
        struct rpc_cred *cred;
        int res = 0;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        nfs_inc_stats(inode, NFSIOS_VFSACCESS);
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2214,7 +2250,7 @@ out:
 out_notsup:
        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (res == 0)
-                res = generic_permission(inode, mask, NULL);
+                res = generic_permission(inode, mask, flags, NULL);
        goto out;
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b9020..9943a75bb6d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
                pos += vec->iov_len;
        }
+        /*
+         * If no bytes were started, return the error, and let the
+         * generic layer handle the completion.
+         */
+        if (requested_bytes == 0) {
+                nfs_direct_req_release(dreq);
+                return result < 0 ? result : -EIO;
+        }
        if (put_dreq(dreq))
                nfs_direct_complete(dreq);
+        return 0;
-        if (requested_bytes != 0)
-                return 0;
-        if (result < 0)
-                return result;
-        return -EIO;
 }
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
                pos += vec->iov_len;
        }
+        /*
+         * If no bytes were started, return the error, and let the
+         * generic layer handle the completion.
+         */
+        if (requested_bytes == 0) {
+                nfs_direct_req_release(dreq);
+                return result < 0 ? result : -EIO;
+        }
        if (put_dreq(dreq))
                nfs_direct_write_complete(dreq, dreq->inode);
+        return 0;
-        if (requested_bytes != 0)
-                return 0;
-        if (result < 0)
-                return result;
-        return -EIO;
 }
 static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -867,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
                goto out;
        nfs_alloc_commit_data(dreq);
-        if (dreq->commit_data == NULL || count < wsize)
+        if (dreq->commit_data == NULL || count <= wsize)
                sync = NFS_FILE_SYNC;
        dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f131..7bf029ef408 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status = 0;
+        unsigned int saved_type = fl->fl_type;
        /* Try local locking first */
        posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
                /* found a conflict */
                goto out;
        }
+        fl->fl_type = saved_type;
        if (nfs_have_delegation(inode, FMODE_READ))
                goto out_noconflict;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index ac7b814ce16..b5ffe8fa291 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
                 * This again causes shrink_dcache_for_umount_subtree() to
                 * Oops, since the test for IS_ROOT() will fail.
                 */
-                spin_lock(&dcache_lock);
+                spin_lock(&sb->s_root->d_inode->i_lock);
+                spin_lock(&sb->s_root->d_lock);
                list_del_init(&sb->s_root->d_alias);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&sb->s_root->d_lock);
+                spin_unlock(&sb->s_root->d_inode->i_lock);
        }
        return 0;
 }
@@ -117,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        }
        security_d_instantiate(ret, inode);
-        if (ret->d_op == NULL)
-                ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
 out:
        nfs_free_fattr(fsinfo.fattr);
        return ret;
@@ -225,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
        security_d_instantiate(ret, inode);
-        if (ret->d_op == NULL)
-                ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
 out:
        nfs_free_fattr(fattr);
        dprintk("<-- nfs4_get_root()\n");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b138..18696882f1c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
        return nfs_idmap_lookup_name(gid, "group", buf, buflen);
 }
-#else  /* CONFIG_NFS_USE_IDMAPPER not defined */
+#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 #include <linux/module.h>
 #include <linux/mutex.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f5716460..1cc600e77bb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
+                        inode->i_data.a_ops = &nfs_dir_aops;
                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
@@ -299,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                                else
                                        inode->i_op = &nfs_mountpoint_inode_operations;
                                inode->i_fop = NULL;
-                                set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags);
+                                inode->i_flags |= S_AUTOMOUNT;
                        }
                } else if (S_ISLNK(inode->i_mode))
                        inode->i_op = &nfs_symlink_inode_operations;
@@ -880,9 +881,10 @@ out:
        return ret;
 }
-static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
+        unsigned long ret = 0;
        if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
                        && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -890,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                nfsi->change_attr = fattr->change_attr;
                if (S_ISDIR(inode->i_mode))
                        nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                ret |= NFS_INO_INVALID_ATTR;
        }
        /* If we have atomic WCC data, we may update some attributes */
        if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
                        && (fattr->valid & NFS_ATTR_FATTR_CTIME)
-                        && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+                        && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
-                        memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+                memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+                ret |= NFS_INO_INVALID_ATTR;
+        }
        if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
                        && (fattr->valid & NFS_ATTR_FATTR_MTIME)
                        && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
-                        memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+                memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-                        if (S_ISDIR(inode->i_mode))
+                if (S_ISDIR(inode->i_mode))
-                                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                        nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                ret |= NFS_INO_INVALID_ATTR;
        }
        if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
                        && (fattr->valid & NFS_ATTR_FATTR_SIZE)
                        && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
-                        && nfsi->npages == 0)
+                        && nfsi->npages == 0) {
-                        i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+                i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+                ret |= NFS_INO_INVALID_ATTR;
+        }
+        return ret;
 }
 /**
@@ -1207,7 +1216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        /* Update the fsid? */
        if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
                        !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
-                        !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
+                        !IS_AUTOMOUNT(inode))
                server->fsid = fattr->fsid;
        /*
@@ -1222,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        | NFS_INO_REVAL_PAGECACHE);
        /* Do atomic weak cache consistency updates */
-        nfs_wcc_update_inode(inode, fattr);
+        invalid |= nfs_wcc_update_inode(inode, fattr);
        /* More cache consistency checks */
        if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1409,9 +1418,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 */
 void nfs4_evict_inode(struct inode *inode)
 {
+        pnfs_destroy_layout(NFS_I(inode));
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
-        pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
@@ -1437,11 +1446,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
        return &nfsi->vfs_inode;
 }
-void nfs_destroy_inode(struct inode *inode)
+static void nfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
+void nfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, nfs_i_callback);
+}
 static inline void nfs4_init_once(struct nfs_inode *nfsi)
 {
 #ifdef CONFIG_NFS_V4
@@ -1611,6 +1627,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
+        nfs_cleanup_cb_ident_idr();
        unregister_nfs_fs();
        nfs_fs_proc_exit();
        nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff45..cf9fdbdabc6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,12 @@ extern void nfs_umount(const struct nfs_mount_request *info);
 /* client.c */
 extern struct rpc_program nfs_program;
+extern void nfs_cleanup_cb_ident_idr(void);
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
+extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
-extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_ident(int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
 extern struct nfs_server *nfs_create_server(
                                        const struct nfs_parsed_mount_data *,
                                        struct nfs_fh *);
@@ -185,17 +188,20 @@ extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
-extern int nfs_stat_to_errno(int);
+extern int nfs_stat_to_errno(enum nfs_stat);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs2_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs3_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs4_decode_dirent(struct xdr_stream *,
+                                struct nfs_entry *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
@@ -245,6 +251,7 @@ extern char *nfs_path(const char *base,
                      const struct dentry *droot,
                      const struct dentry *dentry,
                      char *buffer, ssize_t buflen);
+extern struct vfsmount *nfs_d_automount(struct path *path);
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
@@ -362,6 +369,15 @@ unsigned int nfs_page_length(struct page *page)
 }
 /*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+        return (mode >> 12) & 15;
+}
+/*
 * Determine the number of pages in an array of length 'len' and
 * with a base offset of 'base'
 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f47..d4c2d6b7507 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
                .authflavor     = RPC_AUTH_UNIX,
                .flags          = RPC_CLNT_CREATE_NOPING,
        };
-        struct mountres result;
        struct rpc_message msg  = {
                .rpc_argp       = info->dirpath,
-                .rpc_resp       = &result,
        };
        struct rpc_clnt *clnt;
        int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
                args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
        clnt = rpc_create(&args);
-        if (unlikely(IS_ERR(clnt)))
+        if (IS_ERR(clnt))
                goto out_clnt_err;
        dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
 * XDR encode/decode functions for MOUNT
 */
-static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
 {
        const u32 pathname_len = strlen(pathname);
        __be32 *p;
-        if (unlikely(pathname_len > MNTPATHLEN))
+        BUG_ON(pathname_len > MNTPATHLEN);
-                return -EIO;
+        p = xdr_reserve_space(xdr, 4 + pathname_len);
-        p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
-        if (unlikely(p == NULL))
-                return -EIO;
        xdr_encode_opaque(p, pathname, pathname_len);
-        return 0;
 }
-static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
-                           const char *dirpath)
+                                const char *dirpath)
 {
-        struct xdr_stream xdr;
+        encode_mntdirpath(xdr, dirpath);
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        return encode_mntdirpath(&xdr, dirpath);
 }
 /*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
        u32 status;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(status));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        status = ntohl(*p);
+        status = be32_to_cpup(p);
        for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
                if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
        return 0;
 }
-static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
-                            struct mountres *res)
+                                struct xdr_stream *xdr,
+                                struct mountres *res)
 {
-        struct xdr_stream xdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_status(xdr, res);
-        status = decode_status(&xdr, res);
        if (unlikely(status != 0 || res->errno != 0))
                return status;
-        return decode_fhandle(&xdr, res);
+        return decode_fhandle(xdr, res);
 }
 static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
        u32 status;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(status));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        status = ntohl(*p);
+        status = be32_to_cpup(p);
        for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
                if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
        u32 size;
        __be32 *p;
-        p = xdr_inline_decode(xdr, sizeof(size));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        size = ntohl(*p++);
+        size = be32_to_cpup(p);
        if (size > NFS3_FHSIZE || size == 0)
                return -EIO;
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        if (*count == 0)
                return 0;
-        p = xdr_inline_decode(xdr, sizeof(entries));
+        p = xdr_inline_decode(xdr, 4);
        if (unlikely(p == NULL))
                return -EIO;
-        entries = ntohl(*p);
+        entries = be32_to_cpup(p);
        dprintk("NFS: received %u auth flavors\n", entries);
        if (entries > NFS_MAX_SECFLAVORS)
                entries = NFS_MAX_SECFLAVORS;
-        p = xdr_inline_decode(xdr, sizeof(u32) * entries);
+        p = xdr_inline_decode(xdr, 4 * entries);
        if (unlikely(p == NULL))
                return -EIO;
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
                entries = *count;
        for (i = 0; i < entries; i++) {
-                flavors[i] = ntohl(*p++);
+                flavors[i] = be32_to_cpup(p++);
                dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
        }
        *count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        return 0;
 }
-static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
-                             struct mountres *res)
+                                 struct xdr_stream *xdr,
+                                 struct mountres *res)
 {
-        struct xdr_stream xdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_fhs_status(xdr, res);
-        status = decode_fhs_status(&xdr, res);
        if (unlikely(status != 0 || res->errno != 0))
                return status;
-        status = decode_fhandle3(&xdr, res);
+        status = decode_fhandle3(xdr, res);
        if (unlikely(status != 0)) {
                res->errno = -EBADHANDLE;
                return 0;
        }
-        return decode_auth_flavors(&xdr, res);
+        return decode_auth_flavors(xdr, res);
 }
 static struct rpc_procinfo mnt_procedures[] = {
        [MOUNTPROC_MNT] = {
                .p_proc         = MOUNTPROC_MNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
-                .p_decode       = (kxdrproc_t)mnt_dec_mountres,
+                .p_decode       = (kxdrdproc_t)mnt_xdr_dec_mountres,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_replen       = MNT_dec_mountres_sz,
                .p_statidx      = MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
        },
        [MOUNTPROC_UMNT] = {
                .p_proc         = MOUNTPROC_UMNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_statidx      = MOUNTPROC_UMNT,
                .p_name         = "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
 static struct rpc_procinfo mnt3_procedures[] = {
        [MOUNTPROC3_MNT] = {
                .p_proc         = MOUNTPROC3_MNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
-                .p_decode       = (kxdrproc_t)mnt_dec_mountres3,
+                .p_decode       = (kxdrdproc_t)mnt_xdr_dec_mountres3,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_replen       = MNT_dec_mountres3_sz,
                .p_statidx      = MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
        },
        [MOUNTPROC3_UMNT] = {
                .p_proc         = MOUNTPROC3_UMNT,
-                .p_encode       = (kxdrproc_t)mnt_enc_dirpath,
+                .p_encode       = (kxdreproc_t)mnt_xdr_enc_dirpath,
                .p_arglen       = MNT_enc_dirpath_sz,
                .p_statidx      = MOUNTPROC3_UMNT,
                .p_name         = "UMOUNT",
@@ -505,13 +490,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
 static struct rpc_version mnt_version1 = {
        .number         = 1,
-        .nrprocs        = 2,
+        .nrprocs        = ARRAY_SIZE(mnt_procedures),
        .procs          = mnt_procedures,
 };
 static struct rpc_version mnt_version3 = {
        .number         = 3,
-        .nrprocs        = 2,
+        .nrprocs        = ARRAY_SIZE(mnt3_procedures),
        .procs          = mnt3_procedures,
 };
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf..f32b8603dca 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
               const struct dentry *dentry,
               char *buffer, ssize_t buflen)
 {
-        char *end = buffer+buflen;
+        char *end;
        int namelen;
+        unsigned seq;
+rename_retry:
+        end = buffer+buflen;
        *--end = '\0';
        buflen--;
-        spin_lock(&dcache_lock);
+        seq = read_seqbegin(&rename_lock);
+        rcu_read_lock();
        while (!IS_ROOT(dentry) && dentry != droot) {
                namelen = dentry->d_name.len;
                buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
                *--end = '/';
                dentry = dentry->d_parent;
        }
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
        if (*end != '/') {
                if (--buflen < 0)
                        goto Elong;
@@ -82,15 +89,16 @@ char *nfs_path(const char *base,
        memcpy(end, base, namelen);
        return end;
 Elong_unlock:
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
+        if (read_seqretry(&rename_lock, seq))
+                goto rename_retry;
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
 /*
- * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * nfs_d_automount - Handle crossing a mountpoint on the server
- * @dentry - dentry of mountpoint
+ * @path - The mountpoint
- * @nd - nameidata info
 *
 * When we encounter a mountpoint on the server, we want to set up
 * a mountpoint on the client too, to prevent inode numbers from
@@ -100,87 +108,65 @@ Elong:
 * situation, and that different filesystems may want to use
 * different security flavours.
 */
-static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *nfs_d_automount(struct path *path)
 {
        struct vfsmount *mnt;
-        struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+        struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
        struct dentry *parent;
        struct nfs_fh *fh = NULL;
        struct nfs_fattr *fattr = NULL;
        int err;
-        dprintk("--> nfs_follow_mountpoint()\n");
+        dprintk("--> nfs_d_automount()\n");
-        err = -ESTALE;
+        mnt = ERR_PTR(-ESTALE);
-        if (IS_ROOT(dentry))
+        if (IS_ROOT(path->dentry))
-                goto out_err;
+                goto out_nofree;
-        err = -ENOMEM;
+        mnt = ERR_PTR(-ENOMEM);
        fh = nfs_alloc_fhandle();
        fattr = nfs_alloc_fattr();
        if (fh == NULL || fattr == NULL)
-                goto out_err;
+                goto out;
        dprintk("%s: enter\n", __func__);
-        dput(nd->path.dentry);
-        nd->path.dentry = dget(dentry);
-        /* Look it up again */
+        /* Look it up again to get its attributes */
-        parent = dget_parent(nd->path.dentry);
+        parent = dget_parent(path->dentry);
        err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
-                                                  &nd->path.dentry->d_name,
+                                                  &path->dentry->d_name,
                                                  fh, fattr);
        dput(parent);
-        if (err != 0)
+        if (err != 0) {
-                goto out_err;
+                mnt = ERR_PTR(err);
+                goto out;
+        }
        if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-                mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
+                mnt = nfs_do_refmount(path->mnt, path->dentry);
        else
-                mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
+                mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
-                                      fattr);
-        err = PTR_ERR(mnt);
        if (IS_ERR(mnt))
-                goto out_err;
+                goto out;
-        mntget(mnt);
+        dprintk("%s: done, success\n", __func__);
-        err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
+        mntget(mnt); /* prevent immediate expiration */
-                           &nfs_automount_list);
+        mnt_set_expiry(mnt, &nfs_automount_list);
-        if (err < 0) {
-                mntput(mnt);
-                if (err == -EBUSY)
-                        goto out_follow;
-                goto out_err;
-        }
-        path_put(&nd->path);
-        nd->path.mnt = mnt;
-        nd->path.dentry = dget(mnt->mnt_root);
        schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fh);
-        dprintk("%s: done, returned %d\n", __func__, err);
+out_nofree:
+        dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
-        dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
+        return mnt;
-        return ERR_PTR(err);
-out_err:
-        path_put(&nd->path);
-        goto out;
-out_follow:
-        while (d_mountpoint(nd->path.dentry) &&
-               follow_down(&nd->path))
-                ;
-        err = 0;
-        goto out;
 }
 const struct inode_operations nfs_mountpoint_inode_operations = {
-        .follow_link    = nfs_follow_mountpoint,
        .getattr        = nfs_getattr,
 };
 const struct inode_operations nfs_referral_inode_operations = {
-        .follow_link    = nfs_follow_mountpoint,
 };
 static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc..792cb13a430 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,582 +61,1008 @@
 #define NFS_readdirres_sz       (1)
 #define NFS_statfsres_sz        (1+NFS_info_sz)
 /*
- * Common NFS XDR functions as inlines
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
 */
-static inline __be32 *
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
+                                 unsigned int base, unsigned int len,
+                                 unsigned int bufsize)
 {
-        memcpy(p, fhandle->data, NFS2_FHSIZE);
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
-        return p + XDR_QUADLEN(NFS2_FHSIZE);
+        unsigned int replen;
+        replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
 }
-static inline __be32 *
+/*
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-        /* NFSv2 handles have a fixed length */
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-        fhandle->size = NFS2_FHSIZE;
+                "Remaining buffer length is %tu words.\n",
-        memcpy(fhandle->data, p, NFS2_FHSIZE);
+                func, xdr->end - xdr->p);
-        return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
-static inline __be32*
-xdr_encode_time(__be32 *p, struct timespec *timep)
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+/*
+ *      typedef opaque  nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
 {
-        *p++ = htonl(timep->tv_sec);
+        u32 recvd, count;
-        /* Convert nanoseconds into microseconds */
+        size_t hdrlen;
-        *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0);
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, count);
+        result->eof = 0;        /* NFSv2 does not pass EOF flag on the wire. */
+        result->count = count;
+        return count;
+out_cheating:
+        dprintk("NFS: server cheating in read result: "
+                "count %u > recvd %u\n", count, recvd);
+        count = recvd;
+        goto out;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ *      enum stat {
+ *              NFS_OK = 0,
+ *              NFSERR_PERM = 1,
+ *              NFSERR_NOENT = 2,
+ *              NFSERR_IO = 5,
+ *              NFSERR_NXIO = 6,
+ *              NFSERR_ACCES = 13,
+ *              NFSERR_EXIST = 17,
+ *              NFSERR_NODEV = 19,
+ *              NFSERR_NOTDIR = 20,
+ *              NFSERR_ISDIR = 21,
+ *              NFSERR_FBIG = 27,
+ *              NFSERR_NOSPC = 28,
+ *              NFSERR_ROFS = 30,
+ *              NFSERR_NAMETOOLONG = 63,
+ *              NFSERR_NOTEMPTY = 66,
+ *              NFSERR_DQUOT = 69,
+ *              NFSERR_STALE = 70,
+ *              NFSERR_WFLUSH = 99
+ *      };
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        *status = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * 2.3.2.  ftype
+ *
+ *      enum ftype {
+ *              NFNON = 0,
+ *              NFREG = 1,
+ *              NFDIR = 2,
+ *              NFBLK = 3,
+ *              NFCHR = 4,
+ *              NFLNK = 5
+ *      };
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
+{
+        *type = be32_to_cpup(p++);
+        if (unlikely(*type > NF2FIFO))
+                *type = NFBAD;
        return p;
 }
-static inline __be32*
+/*
-xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
+ * 2.3.3.  fhandle
+ *
+ *      typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        __be32 *p;
+        BUG_ON(fh->size != NFS2_FHSIZE);
+        p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+        memcpy(p, fh->data, NFS2_FHSIZE);
+}
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
-        /*
+        __be32 *p;
-         * Passing the invalid value useconds=1000000 is a
-         * Sun convention for "set to current server time".
+        p = xdr_inline_decode(xdr, NFS2_FHSIZE);
-         * It's needed to make permissions checks for the
+        if (unlikely(p == NULL))
-         * "touch" program across v2 mounts to Solaris and
+                goto out_overflow;
-         * Irix boxes work correctly. See description of
+        fh->size = NFS2_FHSIZE;
-         * sattr in section 6.1 of "NFS Illustrated" by
+        memcpy(fh->data, p, NFS2_FHSIZE);
-         * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
+        return 0;
-         */
+out_overflow:
-        *p++ = htonl(timep->tv_sec);
+        print_overflow_msg(__func__, xdr);
-        *p++ = htonl(1000000);
+        return -EIO;
+}
+/*
+ * 2.3.4.  timeval
+ *
+ *      struct timeval {
+ *              unsigned int seconds;
+ *              unsigned int useconds;
+ *      };
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+        *p++ = cpu_to_be32(timep->tv_sec);
+        if (timep->tv_nsec != 0)
+                *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+        else
+                *p++ = cpu_to_be32(0);
+        return p;
+}
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time".  It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly.  See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+                                              const struct timespec *timep)
+{
+        *p++ = cpu_to_be32(timep->tv_sec);
+        *p++ = cpu_to_be32(1000000);
        return p;
 }
-static inline __be32*
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
-xdr_decode_time(__be32 *p, struct timespec *timep)
 {
-        timep->tv_sec = ntohl(*p++);
+        timep->tv_sec = be32_to_cpup(p++);
-        /* Convert microseconds into nanoseconds */
+        timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
-        timep->tv_nsec = ntohl(*p++) * 1000;
        return p;
 }
-static __be32 *
+/*
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+ * 2.3.5.  fattr
+ *
+ *      struct fattr {
+ *              ftype           type;
+ *              unsigned int    mode;
+ *              unsigned int    nlink;
+ *              unsigned int    uid;
+ *              unsigned int    gid;
+ *              unsigned int    size;
+ *              unsigned int    blocksize;
+ *              unsigned int    rdev;
+ *              unsigned int    blocks;
+ *              unsigned int    fsid;
+ *              unsigned int    fileid;
+ *              timeval         atime;
+ *              timeval         mtime;
+ *              timeval         ctime;
+ *      };
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
        u32 rdev, type;
-        type = ntohl(*p++);
+        __be32 *p;
-        fattr->mode = ntohl(*p++);
-        fattr->nlink = ntohl(*p++);
+        p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
-        fattr->uid = ntohl(*p++);
+        if (unlikely(p == NULL))
-        fattr->gid = ntohl(*p++);
+                goto out_overflow;
-        fattr->size = ntohl(*p++);
-        fattr->du.nfs2.blocksize = ntohl(*p++);
-        rdev = ntohl(*p++);
-        fattr->du.nfs2.blocks = ntohl(*p++);
-        fattr->fsid.major = ntohl(*p++);
-        fattr->fsid.minor = 0;
-        fattr->fileid = ntohl(*p++);
-        p = xdr_decode_time(p, &fattr->atime);
-        p = xdr_decode_time(p, &fattr->mtime);
-        p = xdr_decode_time(p, &fattr->ctime);
        fattr->valid |= NFS_ATTR_FATTR_V2;
+        p = xdr_decode_ftype(p, &type);
+        fattr->mode = be32_to_cpup(p++);
+        fattr->nlink = be32_to_cpup(p++);
+        fattr->uid = be32_to_cpup(p++);
+        fattr->gid = be32_to_cpup(p++);
+        fattr->size = be32_to_cpup(p++);
+        fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+        rdev = be32_to_cpup(p++);
        fattr->rdev = new_decode_dev(rdev);
-        if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
+        if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
                fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
                fattr->rdev = 0;
        }
+        fattr->du.nfs2.blocks = be32_to_cpup(p++);
+        fattr->fsid.major = be32_to_cpup(p++);
+        fattr->fsid.minor = 0;
+        fattr->fileid = be32_to_cpup(p++);
+        p = xdr_decode_time(p, &fattr->atime);
+        p = xdr_decode_time(p, &fattr->mtime);
+        xdr_decode_time(p, &fattr->ctime);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * 2.3.6.  sattr
+ *
+ *      struct sattr {
+ *              unsigned int    mode;
+ *              unsigned int    uid;
+ *              unsigned int    gid;
+ *              unsigned int    size;
+ *              timeval         atime;
+ *              timeval         mtime;
+ *      };
+ */
+#define NFS2_SATTR_NOT_SET      (0xffffffff)
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+        *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
        return p;
 }
-static inline __be32 *
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
-        const __be32 not_set = __constant_htonl(0xFFFFFFFF);
+        __be32 *p;
-        *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
+        p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
-        *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
-        *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
-        *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
-        if (attr->ia_valid & ATTR_ATIME_SET) {
+        if (attr->ia_valid & ATTR_MODE)
+                *p++ = cpu_to_be32(attr->ia_mode);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_UID)
+                *p++ = cpu_to_be32(attr->ia_uid);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_GID)
+                *p++ = cpu_to_be32(attr->ia_gid);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_SIZE)
+                *p++ = cpu_to_be32((u32)attr->ia_size);
+        else
+                *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+        if (attr->ia_valid & ATTR_ATIME_SET)
                p = xdr_encode_time(p, &attr->ia_atime);
-        } else if (attr->ia_valid & ATTR_ATIME) {
+        else if (attr->ia_valid & ATTR_ATIME)
                p = xdr_encode_current_server_time(p, &attr->ia_atime);
-        } else {
+        else
-                *p++ = not_set;
+                p = xdr_time_not_set(p);
-                *p++ = not_set;
+        if (attr->ia_valid & ATTR_MTIME_SET)
-        }
+                xdr_encode_time(p, &attr->ia_mtime);
+        else if (attr->ia_valid & ATTR_MTIME)
-        if (attr->ia_valid & ATTR_MTIME_SET) {
+                xdr_encode_current_server_time(p, &attr->ia_mtime);
-                p = xdr_encode_time(p, &attr->ia_mtime);
+        else
-        } else if (attr->ia_valid & ATTR_MTIME) {
+                xdr_time_not_set(p);
-                p = xdr_encode_current_server_time(p, &attr->ia_mtime);
-        } else {
-                *p++ = not_set; 
-                *p++ = not_set;
-        }
-        return p;
 }
 /*
- * NFS encode functions
+ * 2.3.7.  filename
+ *
+ *      typedef string filename<MAXNAMLEN>;
 */
+static void encode_filename(struct xdr_stream *xdr,
+                            const char *name, u32 length)
+{
+        __be32 *p;
+        BUG_ON(length > NFS2_MAXNAMLEN);
+        p = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(p, name, length);
+}
+static int decode_filename_inline(struct xdr_stream *xdr,
+                                  const char **name, u32 *length)
+{
+        __be32 *p;
+        u32 count;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        if (count > NFS3_MAXNAMLEN)
+                goto out_nametoolong;
+        p = xdr_inline_decode(xdr, count);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        *name = (const char *)p;
+        *length = count;
+        return 0;
+out_nametoolong:
+        dprintk("NFS: returned filename too long: %u\n", count);
+        return -ENAMETOOLONG;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 /*
- * Encode file handle argument
+ * 2.3.8.  path
- * GETATTR, READLINK, STATFS
+ *
+ *      typedef string path<MAXPATHLEN>;
 */
-static int
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
-nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+{
+        __be32 *p;
+        BUG_ON(length > NFS2_MAXPATHLEN);
+        p = xdr_reserve_space(xdr, 4);
+        *p = cpu_to_be32(length);
+        xdr_write_pages(xdr, pages, 0, length);
+}
+static int decode_path(struct xdr_stream *xdr)
 {
-        p = xdr_encode_fhandle(p, fh);
+        u32 length, recvd;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        size_t hdrlen;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p);
+        if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
+                goto out_size;
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(length > recvd))
+                goto out_cheating;
+        xdr_read_pages(xdr, length);
+        xdr_terminate_string(xdr->buf, length);
        return 0;
+out_size:
+        dprintk("NFS: returned pathname too long: %u\n", length);
+        return -ENAMETOOLONG;
+out_cheating:
+        dprintk("NFS: server cheating in pathname result: "
+                "length %u > received %u\n", length, recvd);
+        return -EIO;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
 /*
- * Encode SETATTR arguments
+ * 2.3.9.  attrstat
+ *
+ *      union attrstat switch (stat status) {
+ *      case NFS_OK:
+ *              fattr attributes;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
-nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        enum nfs_stat status;
-        p = xdr_encode_sattr(p, args->sattr);
+        int error;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_fattr(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Encode directory ops argument
+ * 2.3.10.  diropargs
- * LOOKUP, RMDIR
+ *
+ *      struct diropargs {
+ *              fhandle  dir;
+ *              filename name;
+ *      };
 */
-static int
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
-nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
+                             const char *name, u32 length)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_fhandle(xdr, fh);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_filename(xdr, name, length);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode REMOVE argument
+ * 2.3.11.  diropres
+ *
+ *      union diropres switch (stat status) {
+ *      case NFS_OK:
+ *              struct {
+ *                      fhandle file;
+ *                      fattr   attributes;
+ *              } diropok;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
-nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        int error;
-        p = xdr_encode_array(p, args->name.name, args->name.len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        error = decode_fhandle(xdr, result->fh);
-        return 0;
+        if (unlikely(error))
+                goto out;
+        error = decode_fattr(xdr, result->fattr);
+out:
+        return error;
+}
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_diropok(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Arguments to a READ call. Since we read data directly into the page
+ * NFSv2 XDR encode functions
- * cache, we also set up the reply iovec here so that iov[1] points
+ *
- * exactly to the page we want to fetch.
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
 */
-static int
-nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+                                 struct xdr_stream *xdr,
+                                 const struct nfs_fh *fh)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_fhandle(xdr, fh);
-        unsigned int replen;
+}
-        u32 offset = (u32)args->offset;
+/*
+ * 2.2.3.  sattrargs
+ *
+ *      struct sattrargs {
+ *              fhandle file;
+ *              sattr attributes;
+ *      };
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_sattrargs *args)
+{
+        encode_fhandle(xdr, args->fh);
+        encode_sattr(xdr, args->sattr);
+}
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_diropargs *args)
+{
+        encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs_readlinkargs *args)
+{
+        encode_fhandle(xdr, args->fh);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->pglen, NFS_readlinkres_sz);
+}
+/*
+ * 2.2.7.  readargs
+ *
+ *      struct readargs {
+ *              fhandle file;
+ *              unsigned offset;
+ *              unsigned count;
+ *              unsigned totalcount;
+ *      };
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+                            const struct nfs_readargs *args)
+{
+        u32 offset = args->offset;
        u32 count = args->count;
+        __be32 *p;
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_fhandle(xdr, args->fh);
-        *p++ = htonl(offset);
-        *p++ = htonl(count);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4);
-        *p++ = htonl(count);
+        *p++ = cpu_to_be32(offset);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        *p++ = cpu_to_be32(count);
+        *p = cpu_to_be32(count);
+}
-        /* Inline the page array */
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
+                                  struct xdr_stream *xdr,
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
+                                  const struct nfs_readargs *args)
-                         args->pages, args->pgbase, count);
+{
+        encode_readargs(xdr, args);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->count, NFS_readres_sz);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
-        return 0;
 }
 /*
- * Decode READ reply
+ * 2.2.9.  writeargs
+ *
+ *      struct writeargs {
+ *              fhandle file;
+ *              unsigned beginoffset;
+ *              unsigned offset;
+ *              unsigned totalcount;
+ *              nfsdata data;
+ *      };
 */
-static int
+static void encode_writeargs(struct xdr_stream *xdr,
-nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+                             const struct nfs_writeargs *args)
 {
-        struct kvec *iov = req->rq_rcv_buf.head;
+        u32 offset = args->offset;
-        size_t hdrlen;
+        u32 count = args->count;
-        u32 count, recvd;
+        __be32 *p;
-        int status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        p = xdr_decode_fattr(p, res->fattr);
-        count = ntohl(*p++);
-        res->eof = 0;
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READ reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READ header is short. iovec will be shifted.\n");
-                xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
-        }
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        encode_fhandle(xdr, args->fh);
-        if (count > recvd) {
-                dprintk("NFS: server cheating in read reply: "
-                        "count %u > recvd %u\n", count, recvd);
-                count = recvd;
-        }
-        dprintk("RPC:      readres OK count %u\n", count);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
-        if (count < res->count)
+        *p++ = cpu_to_be32(offset);
-                res->count = count;
+        *p++ = cpu_to_be32(offset);
+        *p++ = cpu_to_be32(count);
-        return count;
+        /* nfsdata */
+        *p = cpu_to_be32(count);
+        xdr_write_pages(xdr, args->pages, args->pgbase, count);
 }
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs_writeargs *args)
+{
+        encode_writeargs(xdr, args);
+        xdr->buf->flags |= XDRBUF_WRITE;
+}
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 2.2.10.  createargs
+ *
+ *      struct createargs {
+ *              diropargs where;
+ *              sattr attributes;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
-nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs_createargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        encode_diropargs(xdr, args->fh, args->name, args->len);
-        u32 offset = (u32)args->offset;
+        encode_sattr(xdr, args->sattr);
-        u32 count = args->count;
+}
-        p = xdr_encode_fhandle(p, args->fh);
-        *p++ = htonl(offset);
-        *p++ = htonl(offset);
-        *p++ = htonl(count);
-        *p++ = htonl(count);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
-        /* Copy the page array */
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
-        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+                                    struct xdr_stream *xdr,
-        sndbuf->flags |= XDRBUF_WRITE;
+                                    const struct nfs_removeargs *args)
-        return 0;
+{
+        encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
 }
 /*
- * Encode create arguments
+ * 2.2.12.  renameargs
- * CREATE, MKDIR
+ *
+ *      struct renameargs {
+ *              diropargs from;
+ *              diropargs to;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
-nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        const struct qstr *old = args->old_name;
-        p = xdr_encode_array(p, args->name, args->len);
+        const struct qstr *new = args->new_name;
-        p = xdr_encode_sattr(p, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        encode_diropargs(xdr, args->old_dir, old->name, old->len);
-        return 0;
+        encode_diropargs(xdr, args->new_dir, new->name, new->len);
 }
 /*
- * Encode RENAME arguments
+ * 2.2.13.  linkargs
+ *
+ *      struct linkargs {
+ *              fhandle from;
+ *              diropargs to;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
-nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+                                  struct xdr_stream *xdr,
+                                  const struct nfs_linkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->old_dir);
+        encode_fhandle(xdr, args->fromfh);
-        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+        encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
-        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode LINK arguments
+ * 2.2.14.  symlinkargs
+ *
+ *      struct symlinkargs {
+ *              diropargs from;
+ *              path to;
+ *              sattr attributes;
+ *      };
 */
-static int
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
-nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs_symlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
-        p = xdr_encode_fhandle(p, args->tofh);
+        encode_path(xdr, args->pages, args->pathlen);
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        encode_sattr(xdr, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SYMLINK arguments
+ * 2.2.17.  readdirargs
+ *
+ *      struct readdirargs {
+ *              fhandle dir;
+ *              nfscookie cookie;
+ *              unsigned count;
+ *      };
 */
-static int
+static void encode_readdirargs(struct xdr_stream *xdr,
-nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
+                               const struct nfs_readdirargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        __be32 *p;
-        size_t pad;
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_fhandle(xdr, args->fh);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
-        *p++ = htonl(args->pathlen);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
-        xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+        p = xdr_reserve_space(xdr, 4 + 4);
+        *p++ = cpu_to_be32(args->cookie);
+        *p = cpu_to_be32(args->count);
+}
-        /*
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
-         * xdr_encode_pages may have added a few bytes to ensure the
+                                     struct xdr_stream *xdr,
-         * pathname ends on a 4-byte boundary.  Start encoding the
+                                     const struct nfs_readdirargs *args)
-         * attributes after the pad bytes.
+{
-         */
+        encode_readdirargs(xdr, args);
-        pad = sndbuf->tail->iov_len;
+        prepare_reply_buffer(req, args->pages, 0,
-        if (pad > 0)
+                                        args->count, NFS_readdirres_sz);
-                p++;
-        p = xdr_encode_sattr(p, args->sattr);
-        sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
-        return 0;
 }
 /*
- * Encode arguments to readdir call
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
 */
-static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+                             void *__unused)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        enum nfs_stat status;
-        unsigned int replen;
+        int error;
-        u32 count = args->count;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
+}
-        p = xdr_encode_fhandle(p, args->fh);
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
-        *p++ = htonl(args->cookie);
+                                 struct nfs_fattr *result)
-        *p++ = htonl(count); /* see above */
+{
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        return decode_attrstat(xdr, result);
+}
-        /* Inline the page array */
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2;
+                                 struct nfs_diropok *result)
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
+{
-        return 0;
+        return decode_diropres(xdr, result);
 }
 /*
- * Decode the result of a readdir call.
+ * 2.2.6.  readlinkres
- * We're not really decoding anymore, we just leave the buffer untouched
+ *
- * and only check that it is syntactically correct.
+ *      union readlinkres switch (stat status) {
- * The real decoding happens in nfs_decode_entry below, called directly
+ *      case NFS_OK:
- * from nfs_readdir for each entry.
+ *              path data;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
-nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
+                                    struct xdr_stream *xdr, void *__unused)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        enum nfs_stat status;
-        struct kvec *iov = rcvbuf->head;
+        int error;
-        struct page **page;
-        size_t hdrlen;
+        error = decode_stat(xdr, &status);
-        unsigned int pglen, recvd;
+        if (unlikely(error))
-        int status, nr = 0;
+                goto out;
+        if (status != NFS_OK)
-        if ((status = ntohl(*p++)))
+                goto out_default;
-                return nfs_stat_to_errno(status);
+        error = decode_path(xdr);
+out:
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        return error;
-        if (iov->iov_len < hdrlen) {
+out_default:
-                dprintk("NFS: READDIR reply header overflowed:"
+        return nfs_stat_to_errno(status);
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+}
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        pglen = rcvbuf->page_len;
+/*
-        recvd = rcvbuf->len - hdrlen;
+ * 2.2.7.  readres
-        if (pglen > recvd)
+ *
-                pglen = recvd;
+ *      union readres switch (stat status) {
-        page = rcvbuf->pages;
+ *      case NFS_OK:
-        return nr;
+ *              fattr attributes;
+ *              nfsdata data;
+ *      default:
+ *              void;
+ *      };
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs_readres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_fattr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_nfsdata(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs_writeres *result)
 {
-        dprintk("nfs: %s: prematurely hit end of receive buffer. "
+        /* All NFSv2 writes are "file sync" writes */
-                "Remaining buffer length is %tu words.\n",
+        result->verf->committed = NFS_FILE_SYNC;
-                func, xdr->end - xdr->p);
+        return decode_attrstat(xdr, result->fattr);
 }
-__be32 *
+/**
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17.  entry
+ *
+ *      struct entry {
+ *              unsigned        fileid;
+ *              filename        name;
+ *              nfscookie       cookie;
+ *              entry           *nextentry;
+ *      };
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
 {
        __be32 *p;
+        int error;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
-        if (!ntohl(*p++)) {
+        if (*p++ == xdr_zero) {
                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
+                if (unlikely(p == NULL))
                        goto out_overflow;
-                if (!ntohl(*p++))
+                if (*p++ == xdr_zero)
-                        return ERR_PTR(-EAGAIN);
+                        return -EAGAIN;
                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
+                return -EBADCOOKIE;
        }
-        p = xdr_inline_decode(xdr, 8);
+        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
+        entry->ino = be32_to_cpup(p);
-        entry->ino        = ntohl(*p++);
+        error = decode_filename_inline(xdr, &entry->name, &entry->len);
-        entry->len        = ntohl(*p++);
+        if (unlikely(error))
+                return error;
-        p = xdr_inline_decode(xdr, entry->len + 4);
+        /*
-        if (unlikely(!p))
+         * The type (size and byte order) of nfscookie isn't defined in
+         * RFC 1094.  This implementation assumes that it's an XDR uint32.
+         */
+        entry->prev_cookie = entry->cookie;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
                goto out_overflow;
-        entry->name       = (const char *) p;
+        entry->cookie = be32_to_cpup(p);
-        p                += XDR_QUADLEN(entry->len);
-        entry->prev_cookie        = entry->cookie;
-        entry->cookie     = ntohl(*p++);
-        p = xdr_inline_peek(xdr, 8);
-        if (p != NULL)
-                entry->eof = !p[0] && p[1];
-        else
-                entry->eof = 0;
-        return p;
+        entry->d_type = DT_UNKNOWN;
+        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EAGAIN;
-}
-/*
- * NFS XDR decode functions
- */
-/*
- * Decode simple status reply
- */
-static int
-nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
-{
-        int     status;
-        if ((status = ntohl(*p++)) != 0)
-                status = nfs_stat_to_errno(status);
-        return status;
-}
-/*
- * Decode attrstat reply
- * GETATTR, SETATTR, WRITE
- */
-static int
-nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
-{
-        int     status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        xdr_decode_fattr(p, fattr);
-        return 0;
 }
 /*
- * Decode diropres reply
+ * 2.2.17.  readdirres
- * LOOKUP, CREATE, MKDIR
+ *
+ *      union readdirres switch (stat status) {
+ *      case NFS_OK:
+ *              struct {
+ *                      entry *entries;
+ *                      bool eof;
+ *              } readdirok;
+ *      default:
+ *              void;
+ *      };
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
 */
-static int
+static int decode_readdirok(struct xdr_stream *xdr)
-nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 {
-        int     status;
+        u32 recvd, pglen;
+        size_t hdrlen;
-        if ((status = ntohl(*p++)))
+        pglen = xdr->buf->page_len;
-                return nfs_stat_to_errno(status);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        p = xdr_decode_fhandle(p, res->fh);
+        recvd = xdr->buf->len - hdrlen;
-        xdr_decode_fattr(p, res->fattr);
+        if (unlikely(pglen > recvd))
-        return 0;
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, pglen);
+        return pglen;
+out_cheating:
+        dprintk("NFS: server cheating in readdir result: "
+                "pglen %u > recvd %u\n", pglen, recvd);
+        pglen = recvd;
+        goto out;
 }
-/*
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
- * Encode READLINK args
+                                   struct xdr_stream *xdr, void *__unused)
- */
-static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        enum nfs_stat status;
-        unsigned int replen;
+        int error;
-        p = xdr_encode_fhandle(p, args->fh);
+        error = decode_stat(xdr, &status);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        if (unlikely(error))
+                goto out;
-        /* Inline the page array */
+        if (status != NFS_OK)
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
+                goto out_default;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
+        error = decode_readdirok(xdr);
-        return 0;
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READLINK reply
+ * 2.2.18.  statfsres
+ *
+ *      union statfsres switch (stat status) {
+ *      case NFS_OK:
+ *              struct {
+ *                      unsigned tsize;
+ *                      unsigned bsize;
+ *                      unsigned blocks;
+ *                      unsigned bfree;
+ *                      unsigned bavail;
+ *              } info;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
-nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        size_t hdrlen;
-        u32 len, recvd;
-        int     status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        /* Convert length of symlink */
-        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len) {
-                dprintk("nfs: server returned giant symlink!\n");
-                return -ENAMETOOLONG;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READLINK reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        recvd = req->rq_rcv_buf.len - hdrlen;
-        if (recvd < len) {
-                dprintk("NFS: server cheating in readlink reply: "
-                                "count %u > recvd %u\n", len, recvd);
-                return -EIO;
-        }
-        xdr_terminate_string(rcvbuf, len);
+        p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        result->tsize  = be32_to_cpup(p++);
+        result->bsize  = be32_to_cpup(p++);
+        result->blocks = be32_to_cpup(p++);
+        result->bfree  = be32_to_cpup(p++);
+        result->bavail = be32_to_cpup(p);
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-/*
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
- * Decode WRITE reply
+                                  struct nfs2_fsstat *result)
- */
-static int
-nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 {
-        res->verf->committed = NFS_FILE_SYNC;
+        enum nfs_stat status;
-        return nfs_xdr_attrstat(req, p, res->fattr);
+        int error;
+        error = decode_stat(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS_OK)
+                goto out_default;
+        error = decode_info(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
-/*
- * Decode STATFS reply
- */
-static int
-nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
-{
-        int     status;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
-        res->tsize  = ntohl(*p++);
-        res->bsize  = ntohl(*p++);
-        res->blocks = ntohl(*p++);
-        res->bfree  = ntohl(*p++);
-        res->bavail = ntohl(*p++);
-        return 0;
-}
 /*
 * We need to translate between nfs status return values and
 * the local errno values which may not be the same.
 */
-static struct {
+static const struct {
        int stat;
        int errno;
 } nfs_errtbl[] = {
@@ -676,28 +1102,30 @@ static struct {
        { -1,                   -EIO            }
 };
-/*
+/**
- * Convert an NFS error code to a local one.
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
- * This one is used jointly by NFSv2 and NFSv3.
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
 */
-int
+int nfs_stat_to_errno(enum nfs_stat status)
-nfs_stat_to_errno(int stat)
 {
        int i;
        for (i = 0; nfs_errtbl[i].stat != -1; i++) {
-                if (nfs_errtbl[i].stat == stat)
+                if (nfs_errtbl[i].stat == (int)status)
                        return nfs_errtbl[i].errno;
        }
-        dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+        dprintk("NFS: Unrecognized nfs status value: %u\n", status);
        return nfs_errtbl[i].errno;
 }
 #define PROC(proc, argtype, restype, timer)                             \
 [NFSPROC_##proc] = {                                                    \
        .p_proc     =  NFSPROC_##proc,                                  \
-        .p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,                  \
+        .p_encode   =  (kxdreproc_t)nfs2_xdr_enc_##argtype,             \
-        .p_decode   =  (kxdrproc_t) nfs_xdr_##restype,                  \
+        .p_decode   =  (kxdrdproc_t)nfs2_xdr_dec_##restype,             \
        .p_arglen   =  NFS_##argtype##_sz,                              \
        .p_replen   =  NFS_##restype##_sz,                              \
        .p_timer    =  timer,                                           \
@@ -705,21 +1133,21 @@ nfs_stat_to_errno(int stat)
        .p_name     =  #proc,                                           \
        }
 struct rpc_procinfo     nfs_procedures[] = {
-    PROC(GETATTR,       fhandle,        attrstat, 1),
+        PROC(GETATTR,   fhandle,        attrstat,       1),
-    PROC(SETATTR,       sattrargs,      attrstat, 0),
+        PROC(SETATTR,   sattrargs,      attrstat,       0),
-    PROC(LOOKUP,        diropargs,      diropres, 2),
+        PROC(LOOKUP,    diropargs,      diropres,       2),
-    PROC(READLINK,      readlinkargs,   readlinkres, 3),
+        PROC(READLINK,  readlinkargs,   readlinkres,    3),
-    PROC(READ,          readargs,       readres, 3),
+        PROC(READ,      readargs,       readres,        3),
-    PROC(WRITE,         writeargs,      writeres, 4),
+        PROC(WRITE,     writeargs,      writeres,       4),
-    PROC(CREATE,        createargs,     diropres, 0),
+        PROC(CREATE,    createargs,     diropres,       0),
-    PROC(REMOVE,        removeargs,     stat, 0),
+        PROC(REMOVE,    removeargs,     stat,           0),
-    PROC(RENAME,        renameargs,     stat, 0),
+        PROC(RENAME,    renameargs,     stat,           0),
-    PROC(LINK,          linkargs,       stat, 0),
+        PROC(LINK,      linkargs,       stat,           0),
-    PROC(SYMLINK,       symlinkargs,    stat, 0),
+        PROC(SYMLINK,   symlinkargs,    stat,           0),
-    PROC(MKDIR,         createargs,     diropres, 0),
+        PROC(MKDIR,     createargs,     diropres,       0),
-    PROC(RMDIR,         diropargs,      stat, 0),
+        PROC(RMDIR,     diropargs,      stat,           0),
-    PROC(READDIR,       readdirargs,    readdirres, 3),
+        PROC(READDIR,   readdirargs,    readdirres,     3),
-    PROC(STATFS,        fhandle,        statfsres, 0),
+        PROC(STATFS,    fhandle,        statfsres,      0),
 };
 struct rpc_version              nfs_version2 = {
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e..27434277165 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        if (!nfs_server_capable(inode, NFS_CAP_ACLS))
                goto out;
-        /* We are doing this here, because XDR marshalling can only
+        /* We are doing this here because XDR marshalling does not
-           return -ENOMEM. */
+         * return any results; it BUGs instead. */
        status = -ENOSPC;
        if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
                goto out;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c25..183c6b123d0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
 #define NFS3_filename_sz        (1+(NFS3_MAXNAMLEN>>2))
 #define NFS3_path_sz            (1+(NFS3_MAXPATHLEN>>2))
 #define NFS3_fattr_sz           (21)
-#define NFS3_wcc_attr_sz                (6)
+#define NFS3_cookieverf_sz      (NFS3_COOKIEVERFSIZE>>2)
+#define NFS3_wcc_attr_sz        (6)
 #define NFS3_pre_op_attr_sz     (1+NFS3_wcc_attr_sz)
 #define NFS3_post_op_attr_sz    (1+NFS3_fattr_sz)
-#define NFS3_wcc_data_sz                (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
+#define NFS3_wcc_data_sz        (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
-#define NFS3_fsstat_sz          
-#define NFS3_fsinfo_sz          
-#define NFS3_pathconf_sz                
-#define NFS3_entry_sz           (NFS3_filename_sz+3)
-#define NFS3_sattrargs_sz       (NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz       (NFS3_fh_sz+NFS3_filename_sz)
-#define NFS3_removeargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_getattrargs_sz     (NFS3_fh_sz)
+#define NFS3_setattrargs_sz     (NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz      (NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz    (NFS3_fh_sz)
 #define NFS3_readargs_sz        (NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
 #define NFS3_mkdirargs_sz       (NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_symlinkargs_sz     (NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz       (NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz      (NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_renameargs_sz      (NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz                (NFS3_fh_sz+NFS3_diropargs_sz)
-#define NFS3_readdirargs_sz     (NFS3_fh_sz+2)
+#define NFS3_readdirargs_sz     (NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
 #define NFS3_commitargs_sz      (NFS3_fh_sz+3)
-#define NFS3_attrstat_sz        (1+NFS3_fattr_sz)
+#define NFS3_getattrres_sz      (1+NFS3_fattr_sz)
-#define NFS3_wccstat_sz         (1+NFS3_wcc_data_sz)
+#define NFS3_setattrres_sz      (1+NFS3_wcc_data_sz)
-#define NFS3_removeres_sz       (NFS3_wccstat_sz)
+#define NFS3_removeres_sz       (NFS3_setattrres_sz)
 #define NFS3_lookupres_sz       (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz       (1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz     (1+NFS3_post_op_attr_sz+1)
@@ -100,1077 +100,2365 @@ static const umode_t nfs_type2fmt[] = {
        [NF3FIFO] = S_IFIFO,
 };
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+                                 unsigned int base, unsigned int len,
+                                 unsigned int bufsize)
+{
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        unsigned int replen;
+        replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-        dprintk("nfs: %s: prematurely hit end of receive buffer. "
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
                "Remaining buffer length is %tu words.\n",
                func, xdr->end - xdr->p);
 }
 /*
- * Common NFS XDR functions as inlines
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
 */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
 {
-        return xdr_encode_array(p, fh->data, fh->size);
+        __be32 *p = xdr_reserve_space(xdr, 4);
+        *p = cpu_to_be32(value);
 }
-static inline __be32 *
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
 {
-        if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
+        __be32 *p;
-                memcpy(fh->data, p, fh->size);
-                return p + XDR_QUADLEN(fh->size);
+        p = xdr_inline_decode(xdr, 4);
-        }
+        if (unlikely(p == NULL))
-        return NULL;
+                goto out_overflow;
+        *value = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 8);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        xdr_decode_hyper(p, value);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * fileid3
+ *
+ *      typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+        return xdr_decode_hyper(p, fileid);
+}
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+        return decode_uint64(xdr, fileid);
+}
+/*
+ * filename3
+ *
+ *      typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+                             const char *name, u32 length)
+{
+        __be32 *p;
+        BUG_ON(length > NFS3_MAXNAMLEN);
+        p = xdr_reserve_space(xdr, 4 + length);
+        xdr_encode_opaque(p, name, length);
 }
-static inline __be32 *
+static int decode_inline_filename3(struct xdr_stream *xdr,
-xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+                                   const char **name, u32 *length)
 {
        __be32 *p;
+        u32 count;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        if (count > NFS3_MAXNAMLEN)
+                goto out_nametoolong;
+        p = xdr_inline_decode(xdr, count);
+        if (unlikely(p == NULL))
                goto out_overflow;
-        fh->size = ntohl(*p++);
+        *name = (const char *)p;
+        *length = count;
+        return 0;
-        if (fh->size <= NFS3_FHSIZE) {
+out_nametoolong:
-                p = xdr_inline_decode(xdr, fh->size);
+        dprintk("NFS: returned filename too long: %u\n", count);
-                if (unlikely(!p))
+        return -ENAMETOOLONG;
-                        goto out_overflow;
+out_overflow:
-                memcpy(fh->data, p, fh->size);
+        print_overflow_msg(__func__, xdr);
-                return p + XDR_QUADLEN(fh->size);
+        return -EIO;
-        }
+}
-        return NULL;
+/*
+ * nfspath3
+ *
+ *      typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+                            const u32 length)
+{
+        BUG_ON(length > NFS3_MAXPATHLEN);
+        encode_uint32(xdr, length);
+        xdr_write_pages(xdr, pages, 0, length);
+}
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+        u32 recvd, count;
+        size_t hdrlen;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p);
+        if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+                goto out_nametoolong;
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+        xdr_read_pages(xdr, count);
+        xdr_terminate_string(xdr->buf, count);
+        return 0;
+out_nametoolong:
+        dprintk("NFS: returned pathname too long: %u\n", count);
+        return -ENAMETOOLONG;
+out_cheating:
+        dprintk("NFS: server cheating in pathname result: "
+                "count %u > recvd %u\n", count, recvd);
+        return -EIO;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EIO;
 }
 /*
- * Encode/decode time.
+ * cookie3
+ *
+ *      typedef uint64 cookie3;
 */
-static inline __be32 *
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
-xdr_encode_time3(__be32 *p, struct timespec *timep)
 {
-        *p++ = htonl(timep->tv_sec);
+        return xdr_encode_hyper(p, cookie);
-        *p++ = htonl(timep->tv_nsec);
-        return p;
 }
-static inline __be32 *
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
-xdr_decode_time3(__be32 *p, struct timespec *timep)
 {
-        timep->tv_sec = ntohl(*p++);
+        return decode_uint64(xdr, cookie);
-        timep->tv_nsec = ntohl(*p++);
-        return p;
 }
-static __be32 *
+/*
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+ * cookieverf3
+ *
+ *      typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+        memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+        return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * createverf3
+ *
+ *      typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
 {
-        unsigned int    type, major, minor;
+        __be32 *p;
-        umode_t         fmode;
+        p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+        memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        memcpy(verifier, p, NFS3_WRITEVERFSIZE);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * size3
+ *
+ *      typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+        return xdr_decode_hyper(p, size);
+}
+/*
+ * nfsstat3
+ *
+ *      enum nfsstat3 {
+ *              NFS3_OK = 0,
+ *              ...
+ *      }
+ */
+#define NFS3_OK         NFS_OK
-        type = ntohl(*p++);
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        *status = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * ftype3
+ *
+ *      enum ftype3 {
+ *              NF3REG  = 1,
+ *              NF3DIR  = 2,
+ *              NF3BLK  = 3,
+ *              NF3CHR  = 4,
+ *              NF3LNK  = 5,
+ *              NF3SOCK = 6,
+ *              NF3FIFO = 7
+ *      };
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+        BUG_ON(type > NF3FIFO);
+        encode_uint32(xdr, type);
+}
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+        u32 type;
+        type = be32_to_cpup(p++);
        if (type > NF3FIFO)
                type = NF3NON;
-        fmode = nfs_type2fmt[type];
+        *mode = nfs_type2fmt[type];
-        fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
+        return p;
-        fattr->nlink = ntohl(*p++);
+}
-        fattr->uid = ntohl(*p++);
-        fattr->gid = ntohl(*p++);
-        p = xdr_decode_hyper(p, &fattr->size);
-        p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
-        /* Turn remote device info into Linux-specific dev_t */
-        major = ntohl(*p++);
-        minor = ntohl(*p++);
-        fattr->rdev = MKDEV(major, minor);
-        if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
-                fattr->rdev = 0;
-        p = xdr_decode_hyper(p, &fattr->fsid.major);
+/*
-        fattr->fsid.minor = 0;
+ * specdata3
-        p = xdr_decode_hyper(p, &fattr->fileid);
+ *
-        p = xdr_decode_time3(p, &fattr->atime);
+ *     struct specdata3 {
-        p = xdr_decode_time3(p, &fattr->mtime);
+ *             uint32  specdata1;
-        p = xdr_decode_time3(p, &fattr->ctime);
+ *             uint32  specdata2;
+ *     };
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+        __be32 *p;
-        /* Update the mode bits */
+        p = xdr_reserve_space(xdr, 8);
-        fattr->valid |= NFS_ATTR_FATTR_V3;
+        *p++ = cpu_to_be32(MAJOR(rdev));
+        *p = cpu_to_be32(MINOR(rdev));
+}
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+        unsigned int major, minor;
+        major = be32_to_cpup(p++);
+        minor = be32_to_cpup(p++);
+        *rdev = MKDEV(major, minor);
+        if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+                *rdev = 0;
+        return p;
+}
+/*
+ * nfs_fh3
+ *
+ *      struct nfs_fh3 {
+ *              opaque       data<NFS3_FHSIZE>;
+ *      };
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+        __be32 *p;
+        BUG_ON(fh->size > NFS3_FHSIZE);
+        p = xdr_reserve_space(xdr, 4 + fh->size);
+        xdr_encode_opaque(p, fh->data, fh->size);
+}
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        length = be32_to_cpup(p++);
+        if (unlikely(length > NFS3_FHSIZE))
+                goto out_toobig;
+        p = xdr_inline_decode(xdr, length);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fh->size = length;
+        memcpy(fh->data, p, length);
+        return 0;
+out_toobig:
+        dprintk("NFS: file handle size (%u) too big\n", length);
+        return -E2BIG;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+        memset(fh, 0, sizeof(*fh));
+}
+/*
+ * nfstime3
+ *
+ *      struct nfstime3 {
+ *              uint32  seconds;
+ *              uint32  nseconds;
+ *      };
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+        *p++ = cpu_to_be32(timep->tv_sec);
+        *p++ = cpu_to_be32(timep->tv_nsec);
        return p;
 }
-static inline __be32 *
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
+        timep->tv_sec = be32_to_cpup(p++);
+        timep->tv_nsec = be32_to_cpup(p++);
+        return p;
+}
+/*
+ * sattr3
+ *
+ *      enum time_how {
+ *              DONT_CHANGE             = 0,
+ *              SET_TO_SERVER_TIME      = 1,
+ *              SET_TO_CLIENT_TIME      = 2
+ *      };
+ *
+ *      union set_mode3 switch (bool set_it) {
+ *      case TRUE:
+ *              mode3   mode;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_uid3 switch (bool set_it) {
+ *      case TRUE:
+ *              uid3    uid;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_gid3 switch (bool set_it) {
+ *      case TRUE:
+ *              gid3    gid;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_size3 switch (bool set_it) {
+ *      case TRUE:
+ *              size3   size;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_atime switch (time_how set_it) {
+ *      case SET_TO_CLIENT_TIME:
+ *              nfstime3        atime;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      union set_mtime switch (time_how set_it) {
+ *      case SET_TO_CLIENT_TIME:
+ *              nfstime3  mtime;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct sattr3 {
+ *              set_mode3       mode;
+ *              set_uid3        uid;
+ *              set_gid3        gid;
+ *              set_size3       size;
+ *              set_atime       atime;
+ *              set_mtime       mtime;
+ *      };
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+        u32 nbytes;
+        __be32 *p;
+        /*
+         * In order to make only a single xdr_reserve_space() call,
+         * pre-compute the total number of bytes to be reserved.
+         * Six boolean values, one for each set_foo field, are always
+         * present in the encoded result, so start there.
+         */
+        nbytes = 6 * 4;
+        if (attr->ia_valid & ATTR_MODE)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_UID)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_GID)
+                nbytes += 4;
+        if (attr->ia_valid & ATTR_SIZE)
+                nbytes += 8;
+        if (attr->ia_valid & ATTR_ATIME_SET)
+                nbytes += 8;
+        if (attr->ia_valid & ATTR_MTIME_SET)
+                nbytes += 8;
+        p = xdr_reserve_space(xdr, nbytes);
        if (attr->ia_valid & ATTR_MODE) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_mode & S_IALLUGO);
+                *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_UID) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_uid);
+                *p++ = cpu_to_be32(attr->ia_uid);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_GID) {
                *p++ = xdr_one;
-                *p++ = htonl(attr->ia_gid);
+                *p++ = cpu_to_be32(attr->ia_gid);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_SIZE) {
                *p++ = xdr_one;
-                p = xdr_encode_hyper(p, (__u64) attr->ia_size);
+                p = xdr_encode_hyper(p, (u64)attr->ia_size);
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_ATIME_SET) {
                *p++ = xdr_two;
-                p = xdr_encode_time3(p, &attr->ia_atime);
+                p = xdr_encode_nfstime3(p, &attr->ia_atime);
        } else if (attr->ia_valid & ATTR_ATIME) {
                *p++ = xdr_one;
-        } else {
+        } else
                *p++ = xdr_zero;
-        }
        if (attr->ia_valid & ATTR_MTIME_SET) {
                *p++ = xdr_two;
-                p = xdr_encode_time3(p, &attr->ia_mtime);
+                xdr_encode_nfstime3(p, &attr->ia_mtime);
        } else if (attr->ia_valid & ATTR_MTIME) {
-                *p++ = xdr_one;
+                *p = xdr_one;
-        } else {
+        } else
-                *p++ = xdr_zero;
+                *p = xdr_zero;
-        }
-        return p;
 }
-static inline __be32 *
+/*
-xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
+ * fattr3
+ *
+ *      struct fattr3 {
+ *              ftype3          type;
+ *              mode3           mode;
+ *              uint32          nlink;
+ *              uid3            uid;
+ *              gid3            gid;
+ *              size3           size;
+ *              size3           used;
+ *              specdata3       rdev;
+ *              uint64          fsid;
+ *              fileid3         fileid;
+ *              nfstime3        atime;
+ *              nfstime3        mtime;
+ *              nfstime3        ctime;
+ *      };
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-        p = xdr_decode_hyper(p, &fattr->pre_size);
+        umode_t fmode;
-        p = xdr_decode_time3(p, &fattr->pre_mtime);
+        __be32 *p;
-        p = xdr_decode_time3(p, &fattr->pre_ctime);
+        p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        p = xdr_decode_ftype3(p, &fmode);
+        fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+        fattr->nlink = be32_to_cpup(p++);
+        fattr->uid = be32_to_cpup(p++);
+        fattr->gid = be32_to_cpup(p++);
+        p = xdr_decode_size3(p, &fattr->size);
+        p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+        p = xdr_decode_specdata3(p, &fattr->rdev);
+        p = xdr_decode_hyper(p, &fattr->fsid.major);
+        fattr->fsid.minor = 0;
+        p = xdr_decode_fileid3(p, &fattr->fileid);
+        p = xdr_decode_nfstime3(p, &fattr->atime);
+        p = xdr_decode_nfstime3(p, &fattr->mtime);
+        xdr_decode_nfstime3(p, &fattr->ctime);
+        fattr->valid |= NFS_ATTR_FATTR_V3;
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * post_op_attr
+ *
+ *      union post_op_attr switch (bool attributes_follow) {
+ *      case TRUE:
+ *              fattr3  attributes;
+ *      case FALSE:
+ *              void;
+ *      };
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        if (*p != xdr_zero)
+                return decode_fattr3(xdr, fattr);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * wcc_attr
+ *      struct wcc_attr {
+ *              size3           size;
+ *              nfstime3        mtime;
+ *              nfstime3        ctime;
+ *      };
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+        if (unlikely(p == NULL))
+                goto out_overflow;
        fattr->valid |= NFS_ATTR_FATTR_PRESIZE
                | NFS_ATTR_FATTR_PREMTIME
                | NFS_ATTR_FATTR_PRECTIME;
-        return p;
-}
-static inline __be32 *
+        p = xdr_decode_size3(p, &fattr->pre_size);
-xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
+        p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
-{
+        xdr_decode_nfstime3(p, &fattr->pre_ctime);
-        if (*p++)
-                p = xdr_decode_fattr(p, fattr);
+        return 0;
-        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
-static inline __be32 *
+/*
-xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+ * pre_op_attr
+ *      union pre_op_attr switch (bool attributes_follow) {
+ *      case TRUE:
+ *              wcc_attr        attributes;
+ *      case FALSE:
+ *              void;
+ *      };
+ *
+ * wcc_data
+ *
+ *      struct wcc_data {
+ *              pre_op_attr     before;
+ *              post_op_attr    after;
+ *      };
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
        __be32 *p;
        p = xdr_inline_decode(xdr, 4);
-        if (unlikely(!p))
+        if (unlikely(p == NULL))
                goto out_overflow;
-        if (ntohl(*p++)) {
+        if (*p != xdr_zero)
-                p = xdr_inline_decode(xdr, 84);
+                return decode_wcc_attr(xdr, fattr);
-                if (unlikely(!p))
+        return 0;
-                        goto out_overflow;
-                p = xdr_decode_fattr(p, fattr);
-        }
-        return p;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EIO;
 }
-static inline __be32 *
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
-xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
-        if (*p++)
+        int error;
-                return xdr_decode_wcc_attr(p, fattr);
-        return p;
+        error = decode_pre_op_attr(xdr, fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, fattr);
+out:
+        return error;
 }
+/*
+ * post_op_fh3
+ *
+ *      union post_op_fh3 switch (bool handle_follows) {
+ *      case TRUE:
+ *              nfs_fh3  handle;
+ *      case FALSE:
+ *              void;
+ *      };
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        if (*p != xdr_zero)
+                return decode_nfs_fh3(xdr, fh);
+        zero_nfs_fh3(fh);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
-static inline __be32 *
+/*
-xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
+ * diropargs3
+ *
+ *      struct diropargs3 {
+ *              nfs_fh3         dir;
+ *              filename3       name;
+ *      };
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+                              const char *name, u32 length)
 {
-        p = xdr_decode_pre_op_attr(p, fattr);
+        encode_nfs_fh3(xdr, fh);
-        return xdr_decode_post_op_attr(p, fattr);
+        encode_filename3(xdr, name, length);
 }
 /*
- * NFS encode functions
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
 */
 /*
- * Encode file handle argument
+ * 3.3.1  GETATTR3args
+ *
+ *      struct GETATTR3args {
+ *              nfs_fh3  object;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
-nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+                                      struct xdr_stream *xdr,
+                                      const struct nfs_fh *fh)
 {
-        p = xdr_encode_fhandle(p, fh);
+        encode_nfs_fh3(xdr, fh);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SETATTR arguments
+ * 3.3.2  SETATTR3args
+ *
+ *      union sattrguard3 switch (bool check) {
+ *      case TRUE:
+ *              nfstime3  obj_ctime;
+ *      case FALSE:
+ *              void;
+ *      };
+ *
+ *      struct SETATTR3args {
+ *              nfs_fh3         object;
+ *              sattr3          new_attributes;
+ *              sattrguard3     guard;
+ *      };
 */
-static int
+static void encode_sattrguard3(struct xdr_stream *xdr,
-nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
+                               const struct nfs3_sattrargs *args)
-{
+{
-        p = xdr_encode_fhandle(p, args->fh);
+        __be32 *p;
-        p = xdr_encode_sattr(p, args->sattr);
-        *p++ = htonl(args->guard);
+        if (args->guard) {
-        if (args->guard)
+                p = xdr_reserve_space(xdr, 4 + 8);
-                p = xdr_encode_time3(p, &args->guardtime);
+                *p++ = xdr_one;
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+                xdr_encode_nfstime3(p, &args->guardtime);
-        return 0;
+        } else {
+                p = xdr_reserve_space(xdr, 4);
+                *p = xdr_zero;
+        }
+}
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs3_sattrargs *args)
+{
+        encode_nfs_fh3(xdr, args->fh);
+        encode_sattr3(xdr, args->sattr);
+        encode_sattrguard3(xdr, args);
 }
 /*
- * Encode directory ops argument
+ * 3.3.3  LOOKUP3args
+ *
+ *      struct LOOKUP3args {
+ *              diropargs3  what;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
-nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_diropargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
-        p = xdr_encode_array(p, args->name, args->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode REMOVE argument
+ * 3.3.4  ACCESS3args
+ *
+ *      struct ACCESS3args {
+ *              nfs_fh3         object;
+ *              uint32          access;
+ *      };
 */
-static int
+static void encode_access3args(struct xdr_stream *xdr,
-nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+                               const struct nfs3_accessargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_array(p, args->name.name, args->name.len);
+        encode_uint32(xdr, args->access);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+}
-        return 0;
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_accessargs *args)
+{
+        encode_access3args(xdr, args);
 }
 /*
- * Encode access() argument
+ * 3.3.5  READLINK3args
+ *
+ *      struct READLINK3args {
+ *              nfs_fh3 symlink;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
+                                       struct xdr_stream *xdr,
+                                       const struct nfs3_readlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_nfs_fh3(xdr, args->fh);
-        *p++ = htonl(args->access);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+                                        args->pglen, NFS3_readlinkres_sz);
-        return 0;
 }
 /*
- * Arguments to a READ call. Since we read data directly into the page
+ * 3.3.6  READ3args
- * cache, we also set up the reply iovec here so that iov[1] points
+ *
- * exactly to the page we want to fetch.
+ *      struct READ3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_read3args(struct xdr_stream *xdr,
-nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+                             const struct nfs_readargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        __be32 *p;
-        unsigned int replen;
-        u32 count = args->count;
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(count);
+        *p = cpu_to_be32(args->count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+}
-        /* Inline the page array */
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
+                                   struct xdr_stream *xdr,
-        xdr_inline_pages(&req->rq_rcv_buf, replen,
+                                   const struct nfs_readargs *args)
-                         args->pages, args->pgbase, count);
+{
+        encode_read3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, args->pgbase,
+                                        args->count, NFS3_readres_sz);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
-        return 0;
 }
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 3.3.7  WRITE3args
+ *
+ *      enum stable_how {
+ *              UNSTABLE  = 0,
+ *              DATA_SYNC = 1,
+ *              FILE_SYNC = 2
+ *      };
+ *
+ *      struct WRITE3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *              stable_how      stable;
+ *              opaque          data<>;
+ *      };
 */
-static int
+static void encode_write3args(struct xdr_stream *xdr,
-nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                              const struct nfs_writeargs *args)
 {
-        struct xdr_buf *sndbuf = &req->rq_snd_buf;
+        __be32 *p;
-        u32 count = args->count;
+        encode_nfs_fh3(xdr, args->fh);
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(count);
+        *p++ = cpu_to_be32(args->count);
-        *p++ = htonl(args->stable);
+        *p++ = cpu_to_be32(args->stable);
-        *p++ = htonl(count);
+        *p = cpu_to_be32(args->count);
-        sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+        xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
-        /* Copy the page array */
-        xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
-        sndbuf->flags |= XDRBUF_WRITE;
+                                    struct xdr_stream *xdr,
-        return 0;
+                                    const struct nfs_writeargs *args)
+{
+        encode_write3args(xdr, args);
+        xdr->buf->flags |= XDRBUF_WRITE;
 }
 /*
- * Encode CREATE arguments
+ * 3.3.8  CREATE3args
+ *
+ *      enum createmode3 {
+ *              UNCHECKED = 0,
+ *              GUARDED   = 1,
+ *              EXCLUSIVE = 2
+ *      };
+ *
+ *      union createhow3 switch (createmode3 mode) {
+ *      case UNCHECKED:
+ *      case GUARDED:
+ *              sattr3       obj_attributes;
+ *      case EXCLUSIVE:
+ *              createverf3  verf;
+ *      };
+ *
+ *      struct CREATE3args {
+ *              diropargs3      where;
+ *              createhow3      how;
+ *      };
 */
-static int
+static void encode_createhow3(struct xdr_stream *xdr,
-nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
+                              const struct nfs3_createargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_uint32(xdr, args->createmode);
-        p = xdr_encode_array(p, args->name, args->len);
+        switch (args->createmode) {
+        case NFS3_CREATE_UNCHECKED:
-        *p++ = htonl(args->createmode);
+        case NFS3_CREATE_GUARDED:
-        if (args->createmode == NFS3_CREATE_EXCLUSIVE) {
+                encode_sattr3(xdr, args->sattr);
-                *p++ = args->verifier[0];
+                break;
-                *p++ = args->verifier[1];
+        case NFS3_CREATE_EXCLUSIVE:
-        } else
+                encode_createverf3(xdr, args->verifier);
-                p = xdr_encode_sattr(p, args->sattr);
+                break;
+        default:
+                BUG();
+        }
+}
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
-        return 0;
+                                     struct xdr_stream *xdr,
+                                     const struct nfs3_createargs *args)
+{
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
+        encode_createhow3(xdr, args);
 }
 /*
- * Encode MKDIR arguments
+ * 3.3.9  MKDIR3args
+ *
+ *      struct MKDIR3args {
+ *              diropargs3      where;
+ *              sattr3          attributes;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
+                                    struct xdr_stream *xdr,
+                                    const struct nfs3_mkdirargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_sattr(p, args->sattr);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode SYMLINK arguments
+ * 3.3.10  SYMLINK3args
+ *
+ *      struct symlinkdata3 {
+ *              sattr3          symlink_attributes;
+ *              nfspath3        symlink_data;
+ *      };
+ *
+ *      struct SYMLINK3args {
+ *              diropargs3      where;
+ *              symlinkdata3    symlink;
+ *      };
 */
-static int
+static void encode_symlinkdata3(struct xdr_stream *xdr,
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
+                                const struct nfs3_symlinkargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        encode_nfspath3(xdr, args->pages, args->pathlen);
-        p = xdr_encode_sattr(p, args->sattr);
+}
-        *p++ = htonl(args->pathlen);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* Copy the page */
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
-        xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
+                                      struct xdr_stream *xdr,
-        return 0;
+                                      const struct nfs3_symlinkargs *args)
+{
+        encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+        encode_symlinkdata3(xdr, args);
 }
 /*
- * Encode MKNOD arguments
+ * 3.3.11  MKNOD3args
+ *
+ *      struct devicedata3 {
+ *              sattr3          dev_attributes;
+ *              specdata3       spec;
+ *      };
+ *
+ *      union mknoddata3 switch (ftype3 type) {
+ *      case NF3CHR:
+ *      case NF3BLK:
+ *              devicedata3     device;
+ *      case NF3SOCK:
+ *      case NF3FIFO:
+ *              sattr3          pipe_attributes;
+ *      default:
+ *              void;
+ *      };
+ *
+ *      struct MKNOD3args {
+ *              diropargs3      where;
+ *              mknoddata3      what;
+ *      };
 */
-static int
+static void encode_devicedata3(struct xdr_stream *xdr,
-nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
+                               const struct nfs3_mknodargs *args)
-{
+{
-        p = xdr_encode_fhandle(p, args->fh);
+        encode_sattr3(xdr, args->sattr);
-        p = xdr_encode_array(p, args->name, args->len);
+        encode_specdata3(xdr, args->rdev);
-        *p++ = htonl(args->type);
+}
-        p = xdr_encode_sattr(p, args->sattr);
-        if (args->type == NF3CHR || args->type == NF3BLK) {
+static void encode_mknoddata3(struct xdr_stream *xdr,
-                *p++ = htonl(MAJOR(args->rdev));
+                              const struct nfs3_mknodargs *args)
-                *p++ = htonl(MINOR(args->rdev));
+{
+        encode_ftype3(xdr, args->type);
+        switch (args->type) {
+        case NF3CHR:
+        case NF3BLK:
+                encode_devicedata3(xdr, args);
+                break;
+        case NF3SOCK:
+        case NF3FIFO:
+                encode_sattr3(xdr, args->sattr);
+                break;
+        case NF3REG:
+        case NF3DIR:
+                break;
+        default:
+                BUG();
        }
+}
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
-        return 0;
+                                    struct xdr_stream *xdr,
+                                    const struct nfs3_mknodargs *args)
+{
+        encode_diropargs3(xdr, args->fh, args->name, args->len);
+        encode_mknoddata3(xdr, args);
 }
 /*
- * Encode RENAME arguments
+ * 3.3.12  REMOVE3args
+ *
+ *      struct REMOVE3args {
+ *              diropargs3  object;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+                                     struct xdr_stream *xdr,
-{
+                                     const struct nfs_removeargs *args)
-        p = xdr_encode_fhandle(p, args->old_dir);
+{
-        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+        encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
-        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
 /*
- * Encode LINK arguments
+ * 3.3.14  RENAME3args
+ *
+ *      struct RENAME3args {
+ *              diropargs3      from;
+ *              diropargs3      to;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
-nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
+                                     struct xdr_stream *xdr,
+                                     const struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        const struct qstr *old = args->old_name;
-        p = xdr_encode_fhandle(p, args->tofh);
+        const struct qstr *new = args->new_name;
-        p = xdr_encode_array(p, args->toname, args->tolen);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        encode_diropargs3(xdr, args->old_dir, old->name, old->len);
-        return 0;
+        encode_diropargs3(xdr, args->new_dir, new->name, new->len);
 }
 /*
- * Encode arguments to readdir call
+ * 3.3.15  LINK3args
+ *
+ *      struct LINK3args {
+ *              nfs_fh3         file;
+ *              diropargs3      link;
+ *      };
 */
-static int
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
-nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
+                                   struct xdr_stream *xdr,
+                                   const struct nfs3_linkargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_nfs_fh3(xdr, args->fromfh);
-        unsigned int replen;
+        encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
-        u32 count = args->count;
-        p = xdr_encode_fhandle(p, args->fh);
-        p = xdr_encode_hyper(p, args->cookie);
-        *p++ = args->verf[0];
-        *p++ = args->verf[1];
-        if (args->plus) {
-                /* readdirplus: need dircount + buffer size.
-                 * We just make sure we make dircount big enough */
-                *p++ = htonl(count >> 3);
-        }
-        *p++ = htonl(count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        /* Inline the page array */
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-        return 0;
 }
 /*
- * Decode the result of a readdir call.
+ * 3.3.16  READDIR3args
- * We just check for syntactical correctness.
+ *
+ *      struct READDIR3args {
+ *              nfs_fh3         dir;
+ *              cookie3         cookie;
+ *              cookieverf3     cookieverf;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_readdir3args(struct xdr_stream *xdr,
-nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
+                                const struct nfs3_readdirargs *args)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        struct page **page;
-        size_t hdrlen;
-        u32 recvd, pglen;
-        int status, nr = 0;
-        status = ntohl(*p++);
-        /* Decode post_op_attrs */
-        p = xdr_decode_post_op_attr(p, res->dir_attr);
-        if (status)
-                return nfs_stat_to_errno(status);
-        /* Decode verifier cookie */
-        if (res->verf) {
-                res->verf[0] = *p++;
-                res->verf[1] = *p++;
-        } else {
-                p += 2;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        encode_nfs_fh3(xdr, args->fh);
-        if (iov->iov_len < hdrlen) {
-                dprintk("NFS: READDIR reply header overflowed:"
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
-                return -errno_NFSERR_IO;
-        } else if (iov->iov_len != hdrlen) {
-                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-        }
-        pglen = rcvbuf->page_len;
+        p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
-        recvd = rcvbuf->len - hdrlen;
+        p = xdr_encode_cookie3(p, args->cookie);
-        if (pglen > recvd)
+        p = xdr_encode_cookieverf3(p, args->verf);
-                pglen = recvd;
+        *p = cpu_to_be32(args->count);
-        page = rcvbuf->pages;
+}
-        return nr;
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      const struct nfs3_readdirargs *args)
+{
+        encode_readdir3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, 0,
+                                args->count, NFS3_readdirres_sz);
 }
-__be32 *
+/*
-nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+ * 3.3.17  READDIRPLUS3args
+ *
+ *      struct READDIRPLUS3args {
+ *              nfs_fh3         dir;
+ *              cookie3         cookie;
+ *              cookieverf3     cookieverf;
+ *              count3          dircount;
+ *              count3          maxcount;
+ *      };
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+                                    const struct nfs3_readdirargs *args)
 {
        __be32 *p;
-        struct nfs_entry old = *entry;
-        p = xdr_inline_decode(xdr, 4);
+        encode_nfs_fh3(xdr, args->fh);
-        if (unlikely(!p))
-                goto out_overflow;
-        if (!ntohl(*p++)) {
-                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
-                        goto out_overflow;
-                if (!ntohl(*p++))
-                        return ERR_PTR(-EAGAIN);
-                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
-        }
-        p = xdr_inline_decode(xdr, 12);
+        p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
-        if (unlikely(!p))
+        p = xdr_encode_cookie3(p, args->cookie);
-                goto out_overflow;
+        p = xdr_encode_cookieverf3(p, args->verf);
-        p = xdr_decode_hyper(p, &entry->ino);
-        entry->len  = ntohl(*p++);
-        p = xdr_inline_decode(xdr, entry->len + 8);
-        if (unlikely(!p))
-                goto out_overflow;
-        entry->name = (const char *) p;
-        p += XDR_QUADLEN(entry->len);
-        entry->prev_cookie = entry->cookie;
-        p = xdr_decode_hyper(p, &entry->cookie);
-        if (plus) {
-                entry->fattr->valid = 0;
-                p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
-                if (IS_ERR(p))
-                        goto out_overflow_exit;
-                /* In fact, a post_op_fh3: */
-                p = xdr_inline_decode(xdr, 4);
-                if (unlikely(!p))
-                        goto out_overflow;
-                if (*p++) {
-                        p = xdr_decode_fhandle_stream(xdr, entry->fh);
-                        if (IS_ERR(p))
-                                goto out_overflow_exit;
-                        /* Ugh -- server reply was truncated */
-                        if (p == NULL) {
-                                dprintk("NFS: FH truncated\n");
-                                *entry = old;
-                                return ERR_PTR(-EAGAIN);
-                        }
-                } else
-                        memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
-        }
-        p = xdr_inline_peek(xdr, 8);
+        /*
-        if (p != NULL)
+         * readdirplus: need dircount + buffer size.
-                entry->eof = !p[0] && p[1];
+         * We just make sure we make dircount big enough
-        else
+         */
-                entry->eof = 0;
+        *p++ = cpu_to_be32(args->count >> 3);
-        return p;
+        *p = cpu_to_be32(args->count);
+}
-out_overflow:
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
-        print_overflow_msg(__func__, xdr);
+                                          struct xdr_stream *xdr,
-out_overflow_exit:
+                                          const struct nfs3_readdirargs *args)
-        return ERR_PTR(-EIO);
+{
+        encode_readdirplus3args(xdr, args);
+        prepare_reply_buffer(req, args->pages, 0,
+                                args->count, NFS3_readdirres_sz);
 }
 /*
- * Encode COMMIT arguments
+ * 3.3.21  COMMIT3args
+ *
+ *      struct COMMIT3args {
+ *              nfs_fh3         file;
+ *              offset3         offset;
+ *              count3          count;
+ *      };
 */
-static int
+static void encode_commit3args(struct xdr_stream *xdr,
-nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+                               const struct nfs_writeargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fh);
+        __be32 *p;
+        encode_nfs_fh3(xdr, args->fh);
+        p = xdr_reserve_space(xdr, 8 + 4);
        p = xdr_encode_hyper(p, args->offset);
-        *p++ = htonl(args->count);
+        *p = cpu_to_be32(args->count);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        return 0;
 }
-#ifdef CONFIG_NFS_V3_ACL
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
-/*
+                                     struct xdr_stream *xdr,
- * Encode GETACL arguments
+                                     const struct nfs_writeargs *args)
- */
-static int
-nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
-                    struct nfs3_getaclargs *args)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        encode_commit3args(xdr, args);
-        unsigned int replen;
+}
-        p = xdr_encode_fhandle(p, args->fh);
+#ifdef CONFIG_NFS_V3_ACL
-        *p++ = htonl(args->mask);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        if (args->mask & (NFS_ACL | NFS_DFACL)) {
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
-                /* Inline the page array */
+                                     struct xdr_stream *xdr,
-                replen = (RPC_REPHDRSIZE + auth->au_rslack +
+                                     const struct nfs3_getaclargs *args)
-                          ACL3_getaclres_sz) << 2;
+{
-                xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0,
+        encode_nfs_fh3(xdr, args->fh);
-                                 NFSACL_MAXPAGES << PAGE_SHIFT);
+        encode_uint32(xdr, args->mask);
-        }
+        if (args->mask & (NFS_ACL | NFS_DFACL))
-        return 0;
+                prepare_reply_buffer(req, args->pages, 0,
+                                        NFSACL_MAXPAGES << PAGE_SHIFT,
+                                        ACL3_getaclres_sz);
 }
-/*
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
- * Encode SETACL arguments
+                                     struct xdr_stream *xdr,
- */
+                                     const struct nfs3_setaclargs *args)
-static int
-nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
-                   struct nfs3_setaclargs *args)
 {
-        struct xdr_buf *buf = &req->rq_snd_buf;
        unsigned int base;
-        int err;
+        int error;
-        p = xdr_encode_fhandle(p, NFS_FH(args->inode));
+        encode_nfs_fh3(xdr, NFS_FH(args->inode));
-        *p++ = htonl(args->mask);
+        encode_uint32(xdr, args->mask);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-        base = req->rq_slen;
+        base = req->rq_slen;
        if (args->npages != 0)
-                xdr_encode_pages(buf, args->pages, 0, args->len);
+                xdr_write_pages(xdr, args->pages, 0, args->len);
        else
-                req->rq_slen = xdr_adjust_iovec(req->rq_svec,
+                xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
-                                p + XDR_QUADLEN(args->len));
-        err = nfsacl_encode(buf, base, args->inode,
+        error = nfsacl_encode(xdr->buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
                            args->acl_access : NULL, 1, 0);
-        if (err > 0)
+        BUG_ON(error < 0);
-                err = nfsacl_encode(buf, base + err, args->inode,
+        error = nfsacl_encode(xdr->buf, base + error, args->inode,
-                                    (args->mask & NFS_DFACL) ?
+                            (args->mask & NFS_DFACL) ?
-                                    args->acl_default : NULL, 1,
+                            args->acl_default : NULL, 1,
-                                    NFS_ACL_DEFAULT);
+                            NFS_ACL_DEFAULT);
-        return (err > 0) ? 0 : err;
+        BUG_ON(error < 0);
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 /*
- * NFS XDR decode functions
+ * NFSv3 XDR decode functions
+ *
+ * NFSv3 result types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
 */
 /*
- * Decode attrstat reply.
+ * 3.3.1  GETATTR3res
+ *
+ *      struct GETATTR3resok {
+ *              fattr3          obj_attributes;
+ *      };
+ *
+ *      union GETATTR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              GETATTR3resok  resok;
+ *      default:
+ *              void;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
-nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                                    struct xdr_stream *xdr,
+                                    struct nfs_fattr *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++)))
-                return nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        xdr_decode_fattr(p, fattr);
+        if (unlikely(error))
-        return 0;
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_fattr3(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode status+wcc_data reply
+ * 3.3.2  SETATTR3res
- * SATTR, REMOVE, RMDIR
+ *
+ *      struct SETATTR3resok {
+ *              wcc_data  obj_wcc;
+ *      };
+ *
+ *      struct SETATTR3resfail {
+ *              wcc_data  obj_wcc;
+ *      };
+ *
+ *      union SETATTR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              SETATTR3resok   resok;
+ *      default:
+ *              SETATTR3resfail resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
-nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                                    struct xdr_stream *xdr,
+                                    struct nfs_fattr *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++)))
-                status = nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        xdr_decode_wcc_data(p, fattr);
+        if (unlikely(error))
-        return status;
+                goto out;
+        error = decode_wcc_data(xdr, result);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
-static int
+/*
-nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
+ * 3.3.3  LOOKUP3res
+ *
+ *      struct LOOKUP3resok {
+ *              nfs_fh3         object;
+ *              post_op_attr    obj_attributes;
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      struct LOOKUP3resfail {
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      union LOOKUP3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              LOOKUP3resok    resok;
+ *      default:
+ *              LOOKUP3resfail  resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_diropres *result)
 {
-        return nfs3_xdr_wccstat(req, p, res->dir_attr);
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_nfs_fh3(xdr, result->fh);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->dir_attr);
+out:
+        return error;
+out_default:
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode LOOKUP reply
+ * 3.3.4  ACCESS3res
+ *
+ *      struct ACCESS3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          access;
+ *      };
+ *
+ *      struct ACCESS3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union ACCESS3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              ACCESS3resok    resok;
+ *      default:
+ *              ACCESS3resfail  resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
-nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_accessres *result)
 {
-        int     status;
+        enum nfs_stat status;
+        int error;
-        if ((status = ntohl(*p++))) {
-                status = nfs_stat_to_errno(status);
+        error = decode_nfsstat3(xdr, &status);
-        } else {
+        if (unlikely(error))
-                if (!(p = xdr_decode_fhandle(p, res->fh)))
+                goto out;
-                        return -errno_NFSERR_IO;
+        error = decode_post_op_attr(xdr, result->fattr);
-                p = xdr_decode_post_op_attr(p, res->fattr);
+        if (unlikely(error))
-        }
+                goto out;
-        xdr_decode_post_op_attr(p, res->dir_attr);
+        if (status != NFS3_OK)
-        return status;
+                goto out_default;
+        error = decode_uint32(xdr, &result->access);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode ACCESS reply
+ * 3.3.5  READLINK3res
+ *
+ *      struct READLINK3resok {
+ *              post_op_attr    symlink_attributes;
+ *              nfspath3        data;
+ *      };
+ *
+ *      struct READLINK3resfail {
+ *              post_op_attr    symlink_attributes;
+ *      };
+ *
+ *      union READLINK3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READLINK3resok  resok;
+ *      default:
+ *              READLINK3resfail resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
-nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
+                                     struct xdr_stream *xdr,
+                                     struct nfs_fattr *result)
 {
-        int     status = ntohl(*p++);
+        enum nfs_stat status;
+        int error;
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status)
+        error = decode_nfsstat3(xdr, &status);
-                return nfs_stat_to_errno(status);
+        if (unlikely(error))
-        res->access = ntohl(*p++);
+                goto out;
-        return 0;
+        error = decode_post_op_attr(xdr, result);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_nfspath3(xdr);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
-static int
+/*
-nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
+ * 3.3.6  READ3res
+ *
+ *      struct READ3resok {
+ *              post_op_attr    file_attributes;
+ *              count3          count;
+ *              bool            eof;
+ *              opaque          data<>;
+ *      };
+ *
+ *      struct READ3resfail {
+ *              post_op_attr    file_attributes;
+ *      };
+ *
+ *      union READ3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READ3resok      resok;
+ *      default:
+ *              READ3resfail    resfail;
+ *      };
+ */
+static int decode_read3resok(struct xdr_stream *xdr,
+                             struct nfs_readres *result)
 {
-        struct rpc_auth *auth = req->rq_cred->cr_auth;
+        u32 eof, count, ocount, recvd;
-        unsigned int replen;
+        size_t hdrlen;
+        __be32 *p;
-        p = xdr_encode_fhandle(p, args->fh);
+        p = xdr_inline_decode(xdr, 4 + 4 + 4);
-        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        count = be32_to_cpup(p++);
+        eof = be32_to_cpup(p++);
+        ocount = be32_to_cpup(p++);
+        if (unlikely(ocount != count))
+                goto out_mismatch;
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+        recvd = xdr->buf->len - hdrlen;
+        if (unlikely(count > recvd))
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, count);
+        result->eof = eof;
+        result->count = count;
+        return count;
+out_mismatch:
+        dprintk("NFS: READ count doesn't match length of opaque: "
+                "count %u != ocount %u\n", count, ocount);
+        return -EIO;
+out_cheating:
+        dprintk("NFS: server cheating in read result: "
+                "count %u > recvd %u\n", count, recvd);
+        count = recvd;
+        eof = 0;
+        goto out;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
-        /* Inline the page array */
+static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
-        replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2;
+                                 struct nfs_readres *result)
-        xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
+{
-        return 0;
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_read3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READLINK reply
+ * 3.3.7  WRITE3res
+ *
+ *      enum stable_how {
+ *              UNSTABLE  = 0,
+ *              DATA_SYNC = 1,
+ *              FILE_SYNC = 2
+ *      };
+ *
+ *      struct WRITE3resok {
+ *              wcc_data        file_wcc;
+ *              count3          count;
+ *              stable_how      committed;
+ *              writeverf3      verf;
+ *      };
+ *
+ *      struct WRITE3resfail {
+ *              wcc_data        file_wcc;
+ *      };
+ *
+ *      union WRITE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              WRITE3resok     resok;
+ *      default:
+ *              WRITE3resfail   resfail;
+ *      };
 */
-static int
+static int decode_write3resok(struct xdr_stream *xdr,
-nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+                              struct nfs_writeres *result)
 {
-        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+        __be32 *p;
-        struct kvec *iov = rcvbuf->head;
-        size_t hdrlen;
-        u32 len, recvd;
-        int     status;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        /* Convert length of symlink */
-        len = ntohl(*p++);
-        if (len >= rcvbuf->page_len) {
-                dprintk("nfs: server returned giant symlink!\n");
-                return -ENAMETOOLONG;
-        }
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
-        if (iov->iov_len < hdrlen) {
+        if (unlikely(p == NULL))
-                dprintk("NFS: READLINK reply header overflowed:"
+                goto out_overflow;
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+        result->count = be32_to_cpup(p++);
-                return -errno_NFSERR_IO;
+        result->verf->committed = be32_to_cpup(p++);
-        } else if (iov->iov_len != hdrlen) {
+        if (unlikely(result->verf->committed > NFS_FILE_SYNC))
-                dprintk("NFS: READLINK header is short. "
+                goto out_badvalue;
-                        "iovec will be shifted.\n");
+        memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
-                xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
+        return result->count;
-        }
+out_badvalue:
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
-        if (recvd < len) {
+        return -EIO;
-                dprintk("NFS: server cheating in readlink reply: "
+out_overflow:
-                                "count %u > recvd %u\n", len, recvd);
+        print_overflow_msg(__func__, xdr);
-                return -EIO;
+        return -EIO;
-        }
+}
-        xdr_terminate_string(rcvbuf, len);
+static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
-        return 0;
+                                  struct nfs_writeres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_write3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode READ reply
+ * 3.3.8  CREATE3res
+ *
+ *      struct CREATE3resok {
+ *              post_op_fh3     obj;
+ *              post_op_attr    obj_attributes;
+ *              wcc_data        dir_wcc;
+ *      };
+ *
+ *      struct CREATE3resfail {
+ *              wcc_data        dir_wcc;
+ *      };
+ *
+ *      union CREATE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              CREATE3resok    resok;
+ *      default:
+ *              CREATE3resfail  resfail;
+ *      };
 */
-static int
+static int decode_create3resok(struct xdr_stream *xdr,
-nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+                               struct nfs3_diropres *result)
 {
-        struct kvec *iov = req->rq_rcv_buf.head;
+        int error;
-        size_t hdrlen;
-        u32 count, ocount, recvd;
+        error = decode_post_op_fh3(xdr, result->fh);
-        int status;
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        /* The server isn't required to return a file handle.
+         * If it didn't, force the client to perform a LOOKUP
+         * to determine the correct file handle and attribute
+         * values for the new object. */
+        if (result->fh->size == 0)
+                result->fattr->valid = 0;
+        error = decode_wcc_data(xdr, result->dir_attr);
+out:
+        return error;
+}
-        status = ntohl(*p++);
+static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
-        p = xdr_decode_post_op_attr(p, res->fattr);
+                                   struct xdr_stream *xdr,
+                                   struct nfs3_diropres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_create3resok(xdr, result);
+out:
+        return error;
+out_default:
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
+}
-        if (status != 0)
+/*
-                return nfs_stat_to_errno(status);
+ * 3.3.12  REMOVE3res
+ *
+ *      struct REMOVE3resok {
+ *              wcc_data    dir_wcc;
+ *      };
+ *
+ *      struct REMOVE3resfail {
+ *              wcc_data    dir_wcc;
+ *      };
+ *
+ *      union REMOVE3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              REMOVE3resok   resok;
+ *      default:
+ *              REMOVE3resfail resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_removeres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        /* Decode reply count and EOF flag. NFSv3 is somewhat redundant
+/*
-         * in that it puts the count both in the res struct and in the
+ * 3.3.14  RENAME3res
-         * opaque data count. */
+ *
-        count    = ntohl(*p++);
+ *      struct RENAME3resok {
-        res->eof = ntohl(*p++);
+ *              wcc_data        fromdir_wcc;
-        ocount   = ntohl(*p++);
+ *              wcc_data        todir_wcc;
+ *      };
+ *
+ *      struct RENAME3resfail {
+ *              wcc_data        fromdir_wcc;
+ *              wcc_data        todir_wcc;
+ *      };
+ *
+ *      union RENAME3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              RENAME3resok   resok;
+ *      default:
+ *              RENAME3resfail resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_renameres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->old_fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->new_fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        if (ocount != count) {
+/*
-                dprintk("NFS: READ count doesn't match RPC opaque count.\n");
+ * 3.3.15  LINK3res
-                return -errno_NFSERR_IO;
+ *
-        }
+ *      struct LINK3resok {
+ *              post_op_attr    file_attributes;
+ *              wcc_data        linkdir_wcc;
+ *      };
+ *
+ *      struct LINK3resfail {
+ *              post_op_attr    file_attributes;
+ *              wcc_data        linkdir_wcc;
+ *      };
+ *
+ *      union LINK3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              LINK3resok      resok;
+ *      default:
+ *              LINK3resfail    resfail;
+ *      };
+ */
+static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs3_linkres *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        error = decode_wcc_data(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
+}
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+/**
-        if (iov->iov_len < hdrlen) {
+ * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
-                dprintk("NFS: READ reply header overflowed:"
+ *                      the local page cache
-                                "length %Zu > %Zu\n", hdrlen, iov->iov_len);
+ * @xdr: XDR stream where entry resides
-                return -errno_NFSERR_IO;
+ * @entry: buffer to fill in with entry data
-        } else if (iov->iov_len != hdrlen) {
+ * @plus: boolean indicating whether this should be a readdirplus entry
-                dprintk("NFS: READ header is short. iovec will be shifted.\n");
+ *
-                xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
+ * Returns zero if successful, otherwise a negative errno value is
-        }
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 3.3.16  entry3
+ *
+ *      struct entry3 {
+ *              fileid3         fileid;
+ *              filename3       name;
+ *              cookie3         cookie;
+ *              entry3          *nextentry;
+ *      };
+ *
+ * 3.3.17  entryplus3
+ *      struct entryplus3 {
+ *              fileid3         fileid;
+ *              filename3       name;
+ *              cookie3         cookie;
+ *              post_op_attr    name_attributes;
+ *              post_op_fh3     name_handle;
+ *              entryplus3      *nextentry;
+ *      };
+ */
+int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
+{
+        struct nfs_entry old = *entry;
+        __be32 *p;
+        int error;
-        recvd = req->rq_rcv_buf.len - hdrlen;
+        p = xdr_inline_decode(xdr, 4);
-        if (count > recvd) {
+        if (unlikely(p == NULL))
-                dprintk("NFS: server cheating in read reply: "
+                goto out_overflow;
-                        "count %u > recvd %u\n", count, recvd);
+        if (*p == xdr_zero) {
-                count = recvd;
+                p = xdr_inline_decode(xdr, 4);
-                res->eof = 0;
+                if (unlikely(p == NULL))
+                        goto out_overflow;
+                if (*p == xdr_zero)
+                        return -EAGAIN;
+                entry->eof = 1;
+                return -EBADCOOKIE;
        }
-        if (count < res->count)
+        error = decode_fileid3(xdr, &entry->ino);
-                res->count = count;
+        if (unlikely(error))
+                return error;
-        return count;
+        error = decode_inline_filename3(xdr, &entry->name, &entry->len);
-}
+        if (unlikely(error))
+                return error;
-/*
+        entry->prev_cookie = entry->cookie;
- * Decode WRITE response
+        error = decode_cookie3(xdr, &entry->cookie);
- */
+        if (unlikely(error))
-static int
+                return error;
-nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
-{
-        int     status;
-        status = ntohl(*p++);
+        entry->d_type = DT_UNKNOWN;
-        p = xdr_decode_wcc_data(p, res->fattr);
-        if (status != 0)
+        if (plus) {
-                return nfs_stat_to_errno(status);
+                entry->fattr->valid = 0;
+                error = decode_post_op_attr(xdr, entry->fattr);
+                if (unlikely(error))
+                        return error;
+                if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
+                        entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
-        res->count = ntohl(*p++);
+                /* In fact, a post_op_fh3: */
-        res->verf->committed = (enum nfs3_stable_how)ntohl(*p++);
+                p = xdr_inline_decode(xdr, 4);
-        res->verf->verifier[0] = *p++;
+                if (unlikely(p == NULL))
-        res->verf->verifier[1] = *p++;
+                        goto out_overflow;
+                if (*p != xdr_zero) {
+                        error = decode_nfs_fh3(xdr, entry->fh);
+                        if (unlikely(error)) {
+                                if (error == -E2BIG)
+                                        goto out_truncated;
+                                return error;
+                        }
+                } else
+                        zero_nfs_fh3(entry->fh);
+        }
-        return res->count;
+        return 0;
-}
-/*
+out_overflow:
- * Decode a CREATE response
+        print_overflow_msg(__func__, xdr);
- */
+        return -EAGAIN;
-static int
+out_truncated:
-nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
+        dprintk("NFS: directory entry contains invalid file handle\n");
-{
+        *entry = old;
-        int     status;
+        return -EAGAIN;
-        status = ntohl(*p++);
-        if (status == 0) {
-                if (*p++) {
-                        if (!(p = xdr_decode_fhandle(p, res->fh)))
-                                return -errno_NFSERR_IO;
-                        p = xdr_decode_post_op_attr(p, res->fattr);
-                } else {
-                        memset(res->fh, 0, sizeof(*res->fh));
-                        /* Do decode post_op_attr but set it to NULL */
-                        p = xdr_decode_post_op_attr(p, res->fattr);
-                        res->fattr->valid = 0;
-                }
-        } else {
-                status = nfs_stat_to_errno(status);
-        }
-        p = xdr_decode_wcc_data(p, res->dir_attr);
-        return status;
 }
 /*
- * Decode RENAME reply
+ * 3.3.16  READDIR3res
+ *
+ *      struct dirlist3 {
+ *              entry3          *entries;
+ *              bool            eof;
+ *      };
+ *
+ *      struct READDIR3resok {
+ *              post_op_attr    dir_attributes;
+ *              cookieverf3     cookieverf;
+ *              dirlist3        reply;
+ *      };
+ *
+ *      struct READDIR3resfail {
+ *              post_op_attr    dir_attributes;
+ *      };
+ *
+ *      union READDIR3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              READDIR3resok   resok;
+ *      default:
+ *              READDIR3resfail resfail;
+ *      };
+ *
+ * Read the directory contents into the page cache, but otherwise
+ * don't touch them.  The actual decoding is done by nfs3_decode_entry()
+ * during subsequent nfs_readdir() calls.
 */
-static int
+static int decode_dirlist3(struct xdr_stream *xdr)
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
 {
-        int     status;
+        u32 recvd, pglen;
+        size_t hdrlen;
-        if ((status = ntohl(*p++)) != 0)
+        pglen = xdr->buf->page_len;
-                status = nfs_stat_to_errno(status);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        p = xdr_decode_wcc_data(p, res->old_fattr);
+        recvd = xdr->buf->len - hdrlen;
-        p = xdr_decode_wcc_data(p, res->new_fattr);
+        if (unlikely(pglen > recvd))
-        return status;
+                goto out_cheating;
+out:
+        xdr_read_pages(xdr, pglen);
+        return pglen;
+out_cheating:
+        dprintk("NFS: server cheating in readdir result: "
+                "pglen %u > recvd %u\n", pglen, recvd);
+        pglen = recvd;
+        goto out;
 }
-/*
+static int decode_readdir3resok(struct xdr_stream *xdr,
- * Decode LINK reply
+                                struct nfs3_readdirres *result)
- */
-static int
-nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
 {
-        int     status;
+        int error;
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        /* XXX: do we need to check if result->verf != NULL ? */
+        error = decode_cookieverf3(xdr, result->verf);
+        if (unlikely(error))
+                goto out;
+        error = decode_dirlist3(xdr);
+out:
+        return error;
+}
-        if ((status = ntohl(*p++)) != 0)
+static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
-                status = nfs_stat_to_errno(status);
+                                    struct xdr_stream *xdr,
-        p = xdr_decode_post_op_attr(p, res->fattr);
+                                    struct nfs3_readdirres *result)
-        p = xdr_decode_wcc_data(p, res->dir_attr);
+{
-        return status;
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_readdir3resok(xdr, result);
+out:
+        return error;
+out_default:
+        error = decode_post_op_attr(xdr, result->dir_attr);
+        if (unlikely(error))
+                goto out;
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode FSSTAT reply
+ * 3.3.18  FSSTAT3res
+ *
+ *      struct FSSTAT3resok {
+ *              post_op_attr    obj_attributes;
+ *              size3           tbytes;
+ *              size3           fbytes;
+ *              size3           abytes;
+ *              size3           tfiles;
+ *              size3           ffiles;
+ *              size3           afiles;
+ *              uint32          invarsec;
+ *      };
+ *
+ *      struct FSSTAT3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union FSSTAT3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              FSSTAT3resok    resok;
+ *      default:
+ *              FSSTAT3resfail  resfail;
+ *      };
 */
-static int
+static int decode_fsstat3resok(struct xdr_stream *xdr,
-nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
+                               struct nfs_fsstat *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        p = xdr_decode_hyper(p, &res->tbytes);
-        p = xdr_decode_hyper(p, &res->fbytes);
-        p = xdr_decode_hyper(p, &res->abytes);
-        p = xdr_decode_hyper(p, &res->tfiles);
-        p = xdr_decode_hyper(p, &res->ffiles);
-        p = xdr_decode_hyper(p, &res->afiles);
+        p = xdr_inline_decode(xdr, 8 * 6 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        p = xdr_decode_size3(p, &result->tbytes);
+        p = xdr_decode_size3(p, &result->fbytes);
+        p = xdr_decode_size3(p, &result->abytes);
+        p = xdr_decode_size3(p, &result->tfiles);
+        p = xdr_decode_size3(p, &result->ffiles);
+        xdr_decode_size3(p, &result->afiles);
        /* ignore invarsec */
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_fsstat *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_fsstat3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode FSINFO reply
+ * 3.3.19  FSINFO3res
+ *
+ *      struct FSINFO3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          rtmax;
+ *              uint32          rtpref;
+ *              uint32          rtmult;
+ *              uint32          wtmax;
+ *              uint32          wtpref;
+ *              uint32          wtmult;
+ *              uint32          dtpref;
+ *              size3           maxfilesize;
+ *              nfstime3        time_delta;
+ *              uint32          properties;
+ *      };
+ *
+ *      struct FSINFO3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union FSINFO3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              FSINFO3resok    resok;
+ *      default:
+ *              FSINFO3resfail  resfail;
+ *      };
 */
-static int
+static int decode_fsinfo3resok(struct xdr_stream *xdr,
-nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
+                               struct nfs_fsinfo *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        res->rtmax  = ntohl(*p++);
+        p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
-        res->rtpref = ntohl(*p++);
+        if (unlikely(p == NULL))
-        res->rtmult = ntohl(*p++);
+                goto out_overflow;
-        res->wtmax  = ntohl(*p++);
+        result->rtmax  = be32_to_cpup(p++);
-        res->wtpref = ntohl(*p++);
+        result->rtpref = be32_to_cpup(p++);
-        res->wtmult = ntohl(*p++);
+        result->rtmult = be32_to_cpup(p++);
-        res->dtpref = ntohl(*p++);
+        result->wtmax  = be32_to_cpup(p++);
-        p = xdr_decode_hyper(p, &res->maxfilesize);
+        result->wtpref = be32_to_cpup(p++);
-        p = xdr_decode_time3(p, &res->time_delta);
+        result->wtmult = be32_to_cpup(p++);
+        result->dtpref = be32_to_cpup(p++);
+        p = xdr_decode_size3(p, &result->maxfilesize);
+        xdr_decode_nfstime3(p, &result->time_delta);
        /* ignore properties */
-        res->lease_time = 0;
+        result->lease_time = 0;
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   struct nfs_fsinfo *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_fsinfo3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode PATHCONF reply
+ * 3.3.20  PATHCONF3res
+ *
+ *      struct PATHCONF3resok {
+ *              post_op_attr    obj_attributes;
+ *              uint32          linkmax;
+ *              uint32          name_max;
+ *              bool            no_trunc;
+ *              bool            chown_restricted;
+ *              bool            case_insensitive;
+ *              bool            case_preserving;
+ *      };
+ *
+ *      struct PATHCONF3resfail {
+ *              post_op_attr    obj_attributes;
+ *      };
+ *
+ *      union PATHCONF3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              PATHCONF3resok  resok;
+ *      default:
+ *              PATHCONF3resfail resfail;
+ *      };
 */
-static int
+static int decode_pathconf3resok(struct xdr_stream *xdr,
-nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
+                                 struct nfs_pathconf *result)
 {
-        int             status;
+        __be32 *p;
-        status = ntohl(*p++);
-        p = xdr_decode_post_op_attr(p, res->fattr);
-        if (status != 0)
-                return nfs_stat_to_errno(status);
-        res->max_link = ntohl(*p++);
-        res->max_namelen = ntohl(*p++);
+        p = xdr_inline_decode(xdr, 4 * 6);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        result->max_link = be32_to_cpup(p++);
+        result->max_namelen = be32_to_cpup(p);
        /* ignore remaining fields */
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_pathconf *result)
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        error = decode_post_op_attr(xdr, result->fattr);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_pathconf3resok(xdr, result);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 /*
- * Decode COMMIT reply
+ * 3.3.21  COMMIT3res
+ *
+ *      struct COMMIT3resok {
+ *              wcc_data        file_wcc;
+ *              writeverf3      verf;
+ *      };
+ *
+ *      struct COMMIT3resfail {
+ *              wcc_data        file_wcc;
+ *      };
+ *
+ *      union COMMIT3res switch (nfsstat3 status) {
+ *      case NFS3_OK:
+ *              COMMIT3resok    resok;
+ *      default:
+ *              COMMIT3resfail  resfail;
+ *      };
 */
-static int
+static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
-nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
+                                   struct xdr_stream *xdr,
+                                   struct nfs_writeres *result)
 {
-        int             status;
+        enum nfs_stat status;
+        int error;
-        status = ntohl(*p++);
-        p = xdr_decode_wcc_data(p, res->fattr);
+        error = decode_nfsstat3(xdr, &status);
-        if (status != 0)
+        if (unlikely(error))
-                return nfs_stat_to_errno(status);
+                goto out;
+        error = decode_wcc_data(xdr, result->fattr);
-        res->verf->verifier[0] = *p++;
+        if (unlikely(error))
-        res->verf->verifier[1] = *p++;
+                goto out;
-        return 0;
+        if (status != NFS3_OK)
+                goto out_status;
+        error = decode_writeverf3(xdr, result->verf->verifier);
+out:
+        return error;
+out_status:
+        return nfs_stat_to_errno(status);
 }
 #ifdef CONFIG_NFS_V3_ACL
-/*
- * Decode GETACL reply
+static inline int decode_getacl3resok(struct xdr_stream *xdr,
- */
+                                      struct nfs3_getaclres *result)
-static int
-nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
-                   struct nfs3_getaclres *res)
 {
-        struct xdr_buf *buf = &req->rq_rcv_buf;
-        int status = ntohl(*p++);
        struct posix_acl **acl;
        unsigned int *aclcnt;
-        int err, base;
+        size_t hdrlen;
+        int error;
-        if (status != 0)
-                return nfs_stat_to_errno(status);
+        error = decode_post_op_attr(xdr, result->fattr);
-        p = xdr_decode_post_op_attr(p, res->fattr);
+        if (unlikely(error))
-        res->mask = ntohl(*p++);
+                goto out;
-        if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+        error = decode_uint32(xdr, &result->mask);
-                return -EINVAL;
+        if (unlikely(error))
-        base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base;
+                goto out;
+        error = -EINVAL;
-        acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL;
+        if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
-        aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL;
+                goto out;
-        err = nfsacl_decode(buf, base, aclcnt, acl);
+        hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
-        acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL;
-        aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL;
+        acl = NULL;
-        if (err > 0)
+        if (result->mask & NFS_ACL)
-                err = nfsacl_decode(buf, base + err, aclcnt, acl);
+                acl = &result->acl_access;
-        return (err > 0) ? 0 : err;
+        aclcnt = NULL;
+        if (result->mask & NFS_ACLCNT)
+                aclcnt = &result->acl_access_count;
+        error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
+        if (unlikely(error <= 0))
+                goto out;
+        acl = NULL;
+        if (result->mask & NFS_DFACL)
+                acl = &result->acl_default;
+        aclcnt = NULL;
+        if (result->mask & NFS_DFACLCNT)
+                aclcnt = &result->acl_default_count;
+        error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
+        if (unlikely(error <= 0))
+                return error;
+        error = 0;
+out:
+        return error;
 }
-/*
+static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
- * Decode setacl reply.
+                                   struct xdr_stream *xdr,
- */
+                                   struct nfs3_getaclres *result)
-static int
-nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
-        int status = ntohl(*p++);
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_getacl3resok(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
+}
-        if (status)
+static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
-                return nfs_stat_to_errno(status);
+                                   struct xdr_stream *xdr,
-        xdr_decode_post_op_attr(p, fattr);
+                                   struct nfs_fattr *result)
-        return 0;
+{
+        enum nfs_stat status;
+        int error;
+        error = decode_nfsstat3(xdr, &status);
+        if (unlikely(error))
+                goto out;
+        if (status != NFS3_OK)
+                goto out_default;
+        error = decode_post_op_attr(xdr, result);
+out:
+        return error;
+out_default:
+        return nfs_stat_to_errno(status);
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 #define PROC(proc, argtype, restype, timer)                             \
 [NFS3PROC_##proc] = {                                                   \
        .p_proc      = NFS3PROC_##proc,                                 \
-        .p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,                 \
+        .p_encode    = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args,      \
-        .p_decode    = (kxdrproc_t) nfs3_xdr_##restype,                 \
+        .p_decode    = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res,       \
-        .p_arglen    = NFS3_##argtype##_sz,                             \
+        .p_arglen    = NFS3_##argtype##args_sz,                         \
-        .p_replen    = NFS3_##restype##_sz,                             \
+        .p_replen    = NFS3_##restype##res_sz,                          \
        .p_timer     = timer,                                           \
        .p_statidx   = NFS3PROC_##proc,                                 \
        .p_name      = #proc,                                           \
        }
 struct rpc_procinfo     nfs3_procedures[] = {
-  PROC(GETATTR,         fhandle,        attrstat, 1),
+        PROC(GETATTR,           getattr,        getattr,        1),
-  PROC(SETATTR,         sattrargs,      wccstat, 0),
+        PROC(SETATTR,           setattr,        setattr,        0),
-  PROC(LOOKUP,          diropargs,      lookupres, 2),
+        PROC(LOOKUP,            lookup,         lookup,         2),
-  PROC(ACCESS,          accessargs,     accessres, 1),
+        PROC(ACCESS,            access,         access,         1),
-  PROC(READLINK,        readlinkargs,   readlinkres, 3),
+        PROC(READLINK,          readlink,       readlink,       3),
-  PROC(READ,            readargs,       readres, 3),
+        PROC(READ,              read,           read,           3),
-  PROC(WRITE,           writeargs,      writeres, 4),
+        PROC(WRITE,             write,          write,          4),
-  PROC(CREATE,          createargs,     createres, 0),
+        PROC(CREATE,            create,         create,         0),
-  PROC(MKDIR,           mkdirargs,      createres, 0),
+        PROC(MKDIR,             mkdir,          create,         0),
-  PROC(SYMLINK,         symlinkargs,    createres, 0),
+        PROC(SYMLINK,           symlink,        create,         0),
-  PROC(MKNOD,           mknodargs,      createres, 0),
+        PROC(MKNOD,             mknod,          create,         0),
-  PROC(REMOVE,          removeargs,     removeres, 0),
+        PROC(REMOVE,            remove,         remove,         0),
-  PROC(RMDIR,           diropargs,      wccstat, 0),
+        PROC(RMDIR,             lookup,         setattr,        0),
-  PROC(RENAME,          renameargs,     renameres, 0),
+        PROC(RENAME,            rename,         rename,         0),
-  PROC(LINK,            linkargs,       linkres, 0),
+        PROC(LINK,              link,           link,           0),
-  PROC(READDIR,         readdirargs,    readdirres, 3),
+        PROC(READDIR,           readdir,        readdir,        3),
-  PROC(READDIRPLUS,     readdirargs,    readdirres, 3),
+        PROC(READDIRPLUS,       readdirplus,    readdir,        3),
-  PROC(FSSTAT,          fhandle,        fsstatres, 0),
+        PROC(FSSTAT,            getattr,        fsstat,         0),
-  PROC(FSINFO,          fhandle,        fsinfores, 0),
+        PROC(FSINFO,            getattr,        fsinfo,         0),
-  PROC(PATHCONF,        fhandle,        pathconfres, 0),
+        PROC(PATHCONF,          getattr,        pathconf,       0),
-  PROC(COMMIT,          commitargs,     commitres, 5),
+        PROC(COMMIT,            commit,         commit,         5),
 };
 struct rpc_version              nfs_version3 = {
@@ -1183,8 +2471,8 @@ struct rpc_version		nfs_version3 = {
 static struct rpc_procinfo      nfs3_acl_procedures[] = {
        [ACLPROC3_GETACL] = {
                .p_proc = ACLPROC3_GETACL,
-                .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
+                .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
-                .p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
+                .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
                .p_arglen = ACL3_getaclargs_sz,
                .p_replen = ACL3_getaclres_sz,
                .p_timer = 1,
@@ -1192,8 +2480,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
        },
        [ACLPROC3_SETACL] = {
                .p_proc = ACLPROC3_SETACL,
-                .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
+                .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
-                .p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
+                .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
                .p_arglen = ACL3_setaclargs_sz,
                .p_replen = ACL3_setaclres_sz,
                .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fd..7a747407314 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
        NFS4CLNT_RECLAIM_REBOOT,
        NFS4CLNT_RECLAIM_NOGRACE,
        NFS4CLNT_DELEGRETURN,
+        NFS4CLNT_LAYOUTRECALL,
        NFS4CLNT_SESSION_RESET,
        NFS4CLNT_RECALL_SLOT,
 };
@@ -109,7 +110,7 @@ struct nfs_unique_id {
 struct nfs4_state_owner {
        struct nfs_unique_id so_owner_id;
        struct nfs_server    *so_server;
-        struct rb_node       so_client_node;
+        struct rb_node       so_server_node;
        struct rpc_cred      *so_cred;   /* Associated cred */
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
 extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
-/* inode.c */
-extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
-extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
-extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
 extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
+extern const struct xattr_handler *nfs4_xattr_handlers[];
 #if defined(CONFIG_NFS_V4_1)
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d65..23f930caf1e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 {
        struct nfs4_file_layout_dsaddr *dsaddr;
        int status = -EINVAL;
-        struct nfs_server *nfss = NFS_SERVER(lo->inode);
+        struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
        dprintk("--> %s\n", __func__);
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        /* find and reference the deviceid */
        dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
        if (dsaddr == NULL) {
-                dsaddr = get_device_info(lo->inode, id);
+                dsaddr = get_device_info(lo->plh_inode, id);
                if (dsaddr == NULL)
                        goto out;
        }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 static void
 filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 {
-        struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+        struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
        dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55..f5c9b125e8c 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
        /* ipv6 length plus port is legal */
        if (rlen > INET6_ADDRSTRLEN + 8) {
-                dprintk("%s Invalid address, length %d\n", __func__,
+                dprintk("%s: Invalid address, length %d\n", __func__,
                        rlen);
                goto out_err;
        }
@@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
        /* replace the port dots with dashes for the in4_pton() delimiter*/
        for (i = 0; i < 2; i++) {
                char *res = strrchr(buf, '.');
+                if (!res) {
+                        dprintk("%s: Failed finding expected dots in port\n",
+                                __func__);
+                        goto out_free;
+                }
                *res = '-';
        }
@@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
        port = htons((tmp[0] << 8) | (tmp[1]));
        ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
-        dprintk("%s Decoded address and port %s\n", __func__, buf);
+        dprintk("%s: Decoded address and port %s\n", __func__, buf);
 out_free:
        kfree(buf);
 out_err:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb1..78936a8f40a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,8 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/xattr.h>
+#include <linux/utsname.h>
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -355,9 +357,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 }
 /*
- * Signal state manager thread if session is drained
+ * Signal state manager thread if session fore channel is drained
 */
-static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
+static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 {
        struct rpc_task *task;
@@ -371,8 +373,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
        if (ses->fc_slot_table.highest_used_slotid != -1)
                return;
-        dprintk("%s COMPLETE: Session Drained\n", __func__);
+        dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
-        complete(&ses->complete);
+        complete(&ses->fc_slot_table.complete);
+}
+/*
+ * Signal state manager thread if session back channel is drained
+ */
+void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
+{
+        if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
+            ses->bc_slot_table.highest_used_slotid != -1)
+                return;
+        dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
+        complete(&ses->bc_slot_table.complete);
 }
 static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +403,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        spin_lock(&tbl->slot_tbl_lock);
        nfs4_free_slot(tbl, res->sr_slot);
-        nfs41_check_drain_session_complete(res->sr_session);
+        nfs4_check_drain_fc_complete(res->sr_session);
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slot = NULL;
 }
@@ -1826,6 +1840,8 @@ struct nfs4_closedata {
        struct nfs_closeres res;
        struct nfs_fattr fattr;
        unsigned long timestamp;
+        bool roc;
+        u32 roc_barrier;
 };
 static void nfs4_free_closedata(void *data)
@@ -1833,6 +1849,8 @@ static void nfs4_free_closedata(void *data)
        struct nfs4_closedata *calldata = data;
        struct nfs4_state_owner *sp = calldata->state->owner;
+        if (calldata->roc)
+                pnfs_roc_release(calldata->state->inode);
        nfs4_put_open_state(calldata->state);
        nfs_free_seqid(calldata->arg.seqid);
        nfs4_put_state_owner(sp);
@@ -1865,6 +1883,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
         */
        switch (task->tk_status) {
                case 0:
+                        if (calldata->roc)
+                                pnfs_roc_set_barrier(state->inode,
+                                                     calldata->roc_barrier);
                        nfs_set_open_stateid(state, &calldata->res.stateid, 0);
                        renew_lease(server, calldata->timestamp);
                        nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1938,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
                return;
        }
-        if (calldata->arg.fmode == 0)
+        if (calldata->arg.fmode == 0) {
                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+                if (calldata->roc &&
+                    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
+                        rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
+                                     task, NULL);
+                        return;
+                }
+        }
        nfs_fattr_init(calldata->res.fattr);
        calldata->timestamp = jiffies;
@@ -1946,7 +1974,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
 *
 * NOTE: Caller must be holding the sp->so_owner semaphore!
 */
-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
        struct nfs4_closedata *calldata;
@@ -1981,11 +2009,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
+        calldata->roc = roc;
        path_get(path);
        calldata->path = *path;
-        msg.rpc_argp = &calldata->arg,
+        msg.rpc_argp = &calldata->arg;
-        msg.rpc_resp = &calldata->res,
+        msg.rpc_resp = &calldata->res;
        task_setup_data.callback_data = calldata;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
@@ -1998,6 +2027,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
 out_free_calldata:
        kfree(calldata);
 out:
+        if (roc)
+                pnfs_roc_release(state->inode);
        nfs4_put_open_state(state);
        nfs4_put_state_owner(sp);
        return status;
@@ -2486,6 +2517,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                path = &ctx->path;
                fmode = ctx->mode;
        }
+        sattr->ia_mode &= ~current_umask();
        state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
        d_drop(dentry);
        if (IS_ERR(state)) {
@@ -2816,6 +2848,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 {
        struct nfs4_exception exception = { };
        int err;
+        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
                                _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2852,8 +2886,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
        res.pgbase = args.pgbase;
        status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
-        if (status == 0)
+        if (status >= 0) {
                memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+                status += args.pgbase;
+        }
        nfs_invalidate_atime(dir);
@@ -2914,6 +2950,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 {
        struct nfs4_exception exception = { };
        int err;
+        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
                                _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3359,6 +3397,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
        ret = nfs_revalidate_inode(server, inode);
        if (ret < 0)
                return ret;
+        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+                nfs_zap_acl_cache(inode);
        ret = nfs4_read_cached_acl(inode, buf, buflen);
        if (ret != -ENOENT)
                return ret;
@@ -3387,6 +3427,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
        nfs_inode_return_delegation(inode);
        buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
        ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+        /*
+         * An ACL update can result in an inode attribute update,
+         * so mark the attribute cache invalid.
+         */
+        spin_lock(&inode->i_lock);
+        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+        spin_unlock(&inode->i_lock);
        nfs_access_zap_cache(inode);
        nfs_zap_acl_cache(inode);
        return ret;
@@ -3467,6 +3514,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        struct nfs4_setclientid setclientid = {
                .sc_verifier = &sc_verifier,
                .sc_prog = program,
+                .sc_cb_ident = clp->cl_cb_ident,
        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3506,7 +3554,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                if (signalled())
                        break;
                if (loop++ & 1)
-                        ssleep(clp->cl_lease_time + 1);
+                        ssleep(clp->cl_lease_time / HZ + 1);
                else
                        if (++clp->cl_id_uniquifier == 0)
                                break;
@@ -3652,8 +3700,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        data->rpc_status = 0;
        task_setup_data.callback_data = data;
-        msg.rpc_argp = &data->args,
+        msg.rpc_argp = &data->args;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
@@ -3732,6 +3780,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
                goto out;
        lsp = request->fl_u.nfs4_fl.owner;
        arg.lock_owner.id = lsp->ls_id.id;
+        arg.lock_owner.s_dev = server->s_dev;
        status = nfs4_call_sync(server, &msg, &arg, &res, 1);
        switch (status) {
                case 0:
@@ -3897,8 +3946,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                return ERR_PTR(-ENOMEM);
        }
-        msg.rpc_argp = &data->arg,
+        msg.rpc_argp = &data->arg;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task_setup_data.callback_data = data;
        return rpc_run_task(&task_setup_data);
 }
@@ -3977,6 +4026,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.lock_stateid = &lsp->ls_stateid;
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
+        p->arg.lock_owner.s_dev = server->s_dev;
        p->res.lock_seqid = p->arg.lock_seqid;
        p->lsp = lsp;
        p->server = server;
@@ -4134,8 +4184,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                        data->arg.reclaim = NFS_LOCK_RECLAIM;
                task_setup_data.callback_ops = &nfs4_recover_lock_ops;
        }
-        msg.rpc_argp = &data->arg,
+        msg.rpc_argp = &data->arg;
-        msg.rpc_resp = &data->res,
+        msg.rpc_resp = &data->res;
        task_setup_data.callback_data = data;
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
@@ -4381,48 +4431,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
                return;
        args->lock_owner.clientid = server->nfs_client->cl_clientid;
        args->lock_owner.id = lsp->ls_id.id;
+        args->lock_owner.s_dev = server->s_dev;
        msg.rpc_argp = args;
        rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
 }
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
+static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
-                size_t buflen, int flags)
+                                   const void *buf, size_t buflen,
+                                   int flags, int type)
 {
-        struct inode *inode = dentry->d_inode;
+        if (strcmp(key, "") != 0)
+                return -EINVAL;
-        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
-                return -EOPNOTSUPP;
-        return nfs4_proc_set_acl(inode, buf, buflen);
+        return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
 }
-/* The getxattr man page suggests returning -ENODATA for unknown attributes,
+static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- * and that's what we'll do for e.g. user attributes that haven't been set.
+                                   void *buf, size_t buflen, int type)
- * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
- * attributes in kernel-managed attribute namespaces. */
-ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
-                size_t buflen)
 {
-        struct inode *inode = dentry->d_inode;
+        if (strcmp(key, "") != 0)
+                return -EINVAL;
-        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
-                return -EOPNOTSUPP;
-        return nfs4_proc_get_acl(inode, buf, buflen);
+        return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
 }
-ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
+static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
+                                       size_t list_len, const char *name,
+                                       size_t name_len, int type)
 {
-        size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1;
+        size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
        if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
                return 0;
-        if (buf && buflen < len)
-                return -ERANGE;
+        if (list && len <= list_len)
-        if (buf)
+                memcpy(list, XATTR_NAME_NFSV4_ACL, len);
-                memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
        return len;
 }
@@ -4475,6 +4520,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 #ifdef CONFIG_NFS_V4_1
 /*
+ * Check the exchange flags returned by the server for invalid flags, having
+ * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
+ * DS flags set.
+ */
+static int nfs4_check_cl_exchange_flags(u32 flags)
+{
+        if (flags & ~EXCHGID4_FLAG_MASK_R)
+                goto out_inval;
+        if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
+            (flags & EXCHGID4_FLAG_USE_NON_PNFS))
+                goto out_inval;
+        if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
+                goto out_inval;
+        return NFS_OK;
+out_inval:
+        return -NFS4ERR_INVAL;
+}
+/*
 * nfs4_proc_exchange_id()
 *
 * Since the clientid has expired, all compounds using sessions
@@ -4487,7 +4551,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        nfs4_verifier verifier;
        struct nfs41_exchange_id_args args = {
                .client = clp,
-                .flags = clp->cl_exchange_flags,
+                .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
        };
        struct nfs41_exchange_id_res res = {
                .client = clp,
@@ -4504,34 +4568,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        dprintk("--> %s\n", __func__);
        BUG_ON(clp == NULL);
-        /* Remove server-only flags */
-        args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
        p = (u32 *)verifier.data;
        *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
        *p = htonl((u32)clp->cl_boot_time.tv_nsec);
        args.verifier = &verifier;
-        while (1) {
+        args.id_len = scnprintf(args.id, sizeof(args.id),
-                args.id_len = scnprintf(args.id, sizeof(args.id),
+                                "%s/%s.%s/%u",
-                                        "%s/%s %u",
+                                clp->cl_ipaddr,
-                                        clp->cl_ipaddr,
+                                init_utsname()->nodename,
-                                        rpc_peeraddr2str(clp->cl_rpcclient,
+                                init_utsname()->domainname,
-                                                         RPC_DISPLAY_ADDR),
+                                clp->cl_rpcclient->cl_auth->au_flavor);
-                                        clp->cl_id_uniquifier);
-                status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
-                if (status != -NFS4ERR_CLID_INUSE)
-                        break;
-                if (signalled())
-                        break;
-                if (++clp->cl_id_uniquifier == 0)
-                        break;
-        }
+        status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+        if (!status)
+                status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
        dprintk("<-- %s status= %d\n", __func__, status);
        return status;
 }
@@ -4765,17 +4816,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
        if (!session)
                return NULL;
-        init_completion(&session->complete);
        tbl = &session->fc_slot_table;
        tbl->highest_used_slotid = -1;
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+        init_completion(&tbl->complete);
        tbl = &session->bc_slot_table;
        tbl->highest_used_slotid = -1;
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+        init_completion(&tbl->complete);
        session->session_state = 1<<NFS4_SESSION_INITING;
@@ -5269,13 +5320,23 @@ static void
 nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_layoutget *lgp = calldata;
-        struct inode *ino = lgp->args.inode;
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-        struct nfs_server *server = NFS_SERVER(ino);
        dprintk("--> %s\n", __func__);
+        /* Note there is a race here, where a CB_LAYOUTRECALL can come in
+         * right now covering the LAYOUTGET we are about to send.
+         * However, that is not so catastrophic, and there seems
+         * to be no way to prevent it completely.
+         */
        if (nfs4_setup_sequence(server, &lgp->args.seq_args,
                                &lgp->res.seq_res, 0, task))
                return;
+        if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+                                          NFS_I(lgp->args.inode)->layout,
+                                          lgp->args.ctx->state)) {
+                rpc_exit(task, NFS4_OK);
+                return;
+        }
        rpc_call_start(task);
 }
@@ -5302,7 +5363,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
                        return;
                }
        }
-        lgp->status = task->tk_status;
        dprintk("<-- %s\n", __func__);
 }
@@ -5311,7 +5371,6 @@ static void nfs4_layoutget_release(void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        dprintk("--> %s\n", __func__);
-        put_layout_hdr(lgp->args.inode);
        if (lgp->res.layout.buf != NULL)
                free_page((unsigned long) lgp->res.layout.buf);
        put_nfs_open_context(lgp->args.ctx);
@@ -5356,13 +5415,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
-        if (status != 0)
+        if (status == 0)
-                goto out;
+                status = task->tk_status;
-        status = lgp->status;
+        if (status == 0)
-        if (status != 0)
+                status = pnfs_layout_process(lgp);
-                goto out;
-        status = pnfs_layout_process(lgp);
-out:
        rpc_put_task(task);
        dprintk("<-- %s status=%d\n", __func__, status);
        return status;
@@ -5493,9 +5549,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
-        .getxattr       = nfs4_getxattr,
+        .getxattr       = generic_getxattr,
-        .setxattr       = nfs4_setxattr,
+        .setxattr       = generic_setxattr,
-        .listxattr      = nfs4_listxattr,
+        .listxattr      = generic_listxattr,
+        .removexattr    = generic_removexattr,
 };
 const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5540,6 +5597,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .open_context   = nfs4_atomic_open,
 };
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+        .prefix = XATTR_NAME_NFSV4_ACL,
+        .list   = nfs4_xattr_list_nfs4_acl,
+        .get    = nfs4_xattr_get_nfs4_acl,
+        .set    = nfs4_xattr_set_nfs4_acl,
+};
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+        &nfs4_xattr_nfs4_acl_handler,
+        NULL
+};
 /*
 * Local variables:
 *  c-basic-offset: 8
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af1..402143d75fc 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
        ops = clp->cl_mvops->state_renewal_ops;
        dprintk("%s: start\n", __func__);
-        /* Are there any active superblocks? */
-        if (list_empty(&clp->cl_superblocks))
+        rcu_read_lock();
+        if (list_empty(&clp->cl_superblocks)) {
+                rcu_read_unlock();
                goto out;
+        }
+        rcu_read_unlock();
        spin_lock(&clp->cl_lock);
        lease = clp->cl_lease_time;
        last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
                cred = ops->get_state_renewal_cred_locked(clp);
                spin_unlock(&clp->cl_lock);
                if (cred == NULL) {
-                        if (list_empty(&clp->cl_delegations)) {
+                        if (!nfs_delegations_present(clp)) {
                                set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
                                goto out;
                        }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a312673..e6742b57a04 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
                put_rpccred(cred);
 }
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
 {
+        struct rpc_cred *cred = NULL;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
-        struct rpc_cred *cred = NULL;
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        for (pos = rb_first(&server->state_owners);
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+             pos != NULL;
+             pos = rb_next(pos)) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                if (list_empty(&sp->so_states))
                        continue;
                cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
        return cred;
 }
+/**
+ * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ * Caller must hold clp->cl_lock.
+ */
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+{
+        struct rpc_cred *cred = NULL;
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+                cred = nfs4_get_renew_cred_server_locked(server);
+                if (cred != NULL)
+                        break;
+        }
+        rcu_read_unlock();
+        return cred;
+}
 #if defined(CONFIG_NFS_V4_1)
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
        return status;
 }
+/*
+ * Back channel returns NFS4ERR_DELAY for new requests when
+ * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
+ * is ended.
+ */
 static void nfs4_end_drain_session(struct nfs_client *clp)
 {
        struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
        }
 }
-static int nfs4_begin_drain_session(struct nfs_client *clp)
+static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
 {
-        struct nfs4_session *ses = clp->cl_session;
-        struct nfs4_slot_table *tbl = &ses->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
-        set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
        if (tbl->highest_used_slotid != -1) {
-                INIT_COMPLETION(ses->complete);
+                INIT_COMPLETION(tbl->complete);
                spin_unlock(&tbl->slot_tbl_lock);
-                return wait_for_completion_interruptible(&ses->complete);
+                return wait_for_completion_interruptible(&tbl->complete);
        }
        spin_unlock(&tbl->slot_tbl_lock);
        return 0;
 }
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+        struct nfs4_session *ses = clp->cl_session;
+        int ret = 0;
+        set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
+        /* back channel */
+        ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+        if (ret)
+                return ret;
+        /* fore channel */
+        return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+}
 int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
        int status;
@@ -210,28 +250,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 #endif /* CONFIG_NFS_V4_1 */
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_setclientid_cred_server(struct nfs_server *server)
 {
+        struct nfs_client *clp = server->nfs_client;
+        struct rpc_cred *cred = NULL;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
+        spin_lock(&clp->cl_lock);
+        pos = rb_first(&server->state_owners);
+        if (pos != NULL) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+                cred = get_rpccred(sp->so_cred);
+        }
+        spin_unlock(&clp->cl_lock);
+        return cred;
+}
+/**
+ * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ */
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+{
+        struct nfs_server *server;
        struct rpc_cred *cred;
        spin_lock(&clp->cl_lock);
        cred = nfs4_get_machine_cred_locked(clp);
+        spin_unlock(&clp->cl_lock);
        if (cred != NULL)
                goto out;
-        pos = rb_first(&clp->cl_state_owners);
-        if (pos != NULL) {
+        rcu_read_lock();
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                cred = get_rpccred(sp->so_cred);
+                cred = nfs4_get_setclientid_cred_server(server);
+                if (cred != NULL)
+                        break;
        }
+        rcu_read_unlock();
 out:
-        spin_unlock(&clp->cl_lock);
        return cred;
 }
-static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
+static void nfs_alloc_unique_id_locked(struct rb_root *root,
-                __u64 minval, int maxbits)
+                                       struct nfs_unique_id *new,
+                                       __u64 minval, int maxbits)
 {
        struct rb_node **p, *parent;
        struct nfs_unique_id *pos;
@@ -286,16 +354,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
 }
 static struct nfs4_state_owner *
-nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
-        struct nfs_client *clp = server->nfs_client;
+        struct rb_node **p = &server->state_owners.rb_node,
-        struct rb_node **p = &clp->cl_state_owners.rb_node,
                       *parent = NULL;
        struct nfs4_state_owner *sp, *res = NULL;
        while (*p != NULL) {
                parent = *p;
-                sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
                if (server < sp->so_server) {
                        p = &parent->rb_left;
@@ -319,24 +386,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 }
 static struct nfs4_state_owner *
-nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 {
-        struct rb_node **p = &clp->cl_state_owners.rb_node,
+        struct nfs_server *server = new->so_server;
+        struct rb_node **p = &server->state_owners.rb_node,
                       *parent = NULL;
        struct nfs4_state_owner *sp;
        while (*p != NULL) {
                parent = *p;
-                sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
-                if (new->so_server < sp->so_server) {
-                        p = &parent->rb_left;
-                        continue;
-                }
-                if (new->so_server > sp->so_server) {
-                        p = &parent->rb_right;
-                        continue;
-                }
                if (new->so_cred < sp->so_cred)
                        p = &parent->rb_left;
                else if (new->so_cred > sp->so_cred)
@@ -346,18 +406,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
                        return sp;
                }
        }
-        nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64);
+        nfs_alloc_unique_id_locked(&server->openowner_id,
-        rb_link_node(&new->so_client_node, parent, p);
+                                        &new->so_owner_id, 1, 64);
-        rb_insert_color(&new->so_client_node, &clp->cl_state_owners);
+        rb_link_node(&new->so_server_node, parent, p);
+        rb_insert_color(&new->so_server_node, &server->state_owners);
        return new;
 }
 static void
-nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 {
-        if (!RB_EMPTY_NODE(&sp->so_client_node))
+        struct nfs_server *server = sp->so_server;
-                rb_erase(&sp->so_client_node, &clp->cl_state_owners);
-        nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id);
+        if (!RB_EMPTY_NODE(&sp->so_server_node))
+                rb_erase(&sp->so_server_node, &server->state_owners);
+        nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
 }
 /*
@@ -386,23 +449,32 @@ nfs4_alloc_state_owner(void)
 static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-        if (!RB_EMPTY_NODE(&sp->so_client_node)) {
+        if (!RB_EMPTY_NODE(&sp->so_server_node)) {
-                struct nfs_client *clp = sp->so_server->nfs_client;
+                struct nfs_server *server = sp->so_server;
+                struct nfs_client *clp = server->nfs_client;
                spin_lock(&clp->cl_lock);
-                rb_erase(&sp->so_client_node, &clp->cl_state_owners);
+                rb_erase(&sp->so_server_node, &server->state_owners);
-                RB_CLEAR_NODE(&sp->so_client_node);
+                RB_CLEAR_NODE(&sp->so_server_node);
                spin_unlock(&clp->cl_lock);
        }
 }
-struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+                                              struct rpc_cred *cred)
 {
        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp, *new;
        spin_lock(&clp->cl_lock);
-        sp = nfs4_find_state_owner(server, cred);
+        sp = nfs4_find_state_owner_locked(server, cred);
        spin_unlock(&clp->cl_lock);
        if (sp != NULL)
                return sp;
@@ -412,7 +484,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
        new->so_server = server;
        new->so_cred = cred;
        spin_lock(&clp->cl_lock);
-        sp = nfs4_insert_state_owner(clp, new);
+        sp = nfs4_insert_state_owner_locked(new);
        spin_unlock(&clp->cl_lock);
        if (sp == new)
                get_rpccred(cred);
@@ -423,6 +495,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
        return sp;
 }
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
        struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +507,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
        if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
                return;
-        nfs4_remove_state_owner(clp, sp);
+        nfs4_remove_state_owner_locked(sp);
        spin_unlock(&clp->cl_lock);
        rpc_destroy_wait_queue(&sp->so_sequence.wait);
        put_rpccred(cred);
@@ -585,8 +662,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
        if (!call_close) {
                nfs4_put_open_state(state);
                nfs4_put_state_owner(owner);
-        } else
+        } else {
-                nfs4_do_close(path, state, gfp_mask, wait);
+                bool roc = pnfs_roc(state->inode);
+                nfs4_do_close(path, state, gfp_mask, wait, roc);
+        }
 }
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +713,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
 static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
        struct nfs4_lock_state *lsp;
-        struct nfs_client *clp = state->owner->so_server->nfs_client;
+        struct nfs_server *server = state->owner->so_server;
+        struct nfs_client *clp = server->nfs_client;
        lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
        if (lsp == NULL)
@@ -657,7 +738,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
                return NULL;
        }
        spin_lock(&clp->cl_lock);
-        nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
+        nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
        spin_unlock(&clp->cl_lock);
        INIT_LIST_HEAD(&lsp->ls_locks);
        return lsp;
@@ -665,10 +746,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-        struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
+        struct nfs_server *server = lsp->ls_state->owner->so_server;
+        struct nfs_client *clp = server->nfs_client;
        spin_lock(&clp->cl_lock);
-        nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
+        nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
        spin_unlock(&clp->cl_lock);
        rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
        kfree(lsp);
@@ -1114,15 +1196,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
        }
 }
-static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+static void nfs4_reset_seqids(struct nfs_server *server,
+        int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
 {
+        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
-        /* Reset all sequence ids to zero */
+        spin_lock(&clp->cl_lock);
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        for (pos = rb_first(&server->state_owners);
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+             pos != NULL;
+             pos = rb_next(pos)) {
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                sp->so_seqid.flags = 0;
                spin_lock(&sp->so_lock);
                list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1217,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
                }
                spin_unlock(&sp->so_lock);
        }
+        spin_unlock(&clp->cl_lock);
+}
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+        int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+        struct nfs_server *server;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs4_reset_seqids(server, mark_reclaim);
+        rcu_read_unlock();
 }
 static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1246,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
                (void)ops->reclaim_complete(clp);
 }
-static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
 {
+        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
-        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+        spin_lock(&clp->cl_lock);
-                return 0;
+        for (pos = rb_first(&server->state_owners);
+             pos != NULL;
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+             pos = rb_next(pos)) {
-                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+                sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
                spin_lock(&sp->so_lock);
                list_for_each_entry(state, &sp->so_states, open_states) {
-                        if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+                        if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+                                                &state->flags))
                                continue;
                        nfs4_state_mark_reclaim_nograce(clp, state);
                }
                spin_unlock(&sp->so_lock);
        }
+        spin_unlock(&clp->cl_lock);
+}
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+        struct nfs_server *server;
+        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+                return 0;
+        rcu_read_lock();
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                nfs4_clear_reclaim_server(server);
+        rcu_read_unlock();
        nfs_delegation_reap_unclaimed(clp);
        return 1;
@@ -1238,27 +1352,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
 {
+        struct nfs4_state_owner *sp;
+        struct nfs_server *server;
        struct rb_node *pos;
        int status = 0;
 restart:
-        spin_lock(&clp->cl_lock);
+        rcu_read_lock();
-        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-                struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+                spin_lock(&clp->cl_lock);
-                if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
+                for (pos = rb_first(&server->state_owners);
-                        continue;
+                     pos != NULL;
-                atomic_inc(&sp->so_count);
+                     pos = rb_next(pos)) {
-                spin_unlock(&clp->cl_lock);
+                        sp = rb_entry(pos,
-                status = nfs4_reclaim_open_state(sp, ops);
+                                struct nfs4_state_owner, so_server_node);
-                if (status < 0) {
+                        if (!test_and_clear_bit(ops->owner_flag_bit,
-                        set_bit(ops->owner_flag_bit, &sp->so_flags);
+                                                        &sp->so_flags))
+                                continue;
+                        atomic_inc(&sp->so_count);
+                        spin_unlock(&clp->cl_lock);
+                        rcu_read_unlock();
+                        status = nfs4_reclaim_open_state(sp, ops);
+                        if (status < 0) {
+                                set_bit(ops->owner_flag_bit, &sp->so_flags);
+                                nfs4_put_state_owner(sp);
+                                return nfs4_recovery_handle_error(clp, status);
+                        }
                        nfs4_put_state_owner(sp);
-                        return nfs4_recovery_handle_error(clp, status);
+                        goto restart;
                }
-                nfs4_put_state_owner(sp);
+                spin_unlock(&clp->cl_lock);
-                goto restart;
        }
-        spin_unlock(&clp->cl_lock);
+        rcu_read_unlock();
        return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e..4e2c168b6ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
 /* lock,open owner id:
 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
 */
-#define open_owner_id_maxsz     (1 + 4)
+#define open_owner_id_maxsz     (1 + 1 + 4)
-#define lock_owner_id_maxsz     (1 + 4)
+#define lock_owner_id_maxsz     (1 + 1 + 4)
 #define decode_lockowner_maxsz  (1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define compound_encode_hdr_maxsz       (3 + (NFS4_MAXTAGLEN >> 2))
 #define compound_decode_hdr_maxsz       (3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
 {
        __be32 *p;
-        p = reserve_space(xdr, 28);
+        p = reserve_space(xdr, 32);
        p = xdr_encode_hyper(p, lowner->clientid);
-        *p++ = cpu_to_be32(16);
+        *p++ = cpu_to_be32(20);
        p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+        *p++ = cpu_to_be32(lowner->s_dev);
        xdr_encode_hyper(p, lowner->id);
 }
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
        *p++ = cpu_to_be32(OP_OPEN);
        *p = cpu_to_be32(arg->seqid->sequence->counter);
        encode_share_access(xdr, arg->fmode);
-        p = reserve_space(xdr, 28);
+        p = reserve_space(xdr, 32);
        p = xdr_encode_hyper(p, arg->clientid);
-        *p++ = cpu_to_be32(16);
+        *p++ = cpu_to_be32(20);
        p = xdr_encode_opaque_fixed(p, "open id:", 8);
+        *p++ = cpu_to_be32(arg->server->s_dev);
        xdr_encode_hyper(p, arg->id);
 }
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        hdr->replen += decode_restorefh_maxsz;
 }
-static int
+static void
 encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
 {
        __be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
        p = reserve_space(xdr, 2*4);
        *p++ = cpu_to_be32(1);
        *p = cpu_to_be32(FATTR4_WORD0_ACL);
-        if (arg->acl_len % 4)
+        BUG_ON(arg->acl_len % 4);
-                return -EINVAL;
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(arg->acl_len);
        xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
        hdr->nops++;
        hdr->replen += decode_setacl_maxsz;
-        return 0;
 }
 static void
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
                      const struct nfs4_layoutget_args *args,
                      struct compound_hdr *hdr)
 {
-        nfs4_stateid stateid;
        __be32 *p;
        p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
        p = xdr_encode_hyper(p, args->range.offset);
        p = xdr_encode_hyper(p, args->range.length);
        p = xdr_encode_hyper(p, args->minlength);
-        pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+        p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
-                                args->ctx->state);
-        p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
        *p = cpu_to_be32(args->maxcount);
        dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 /*
 * Encode an ACCESS request
 */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_accessargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_access(xdr, args->access, &hdr);
-        encode_access(&xdr, args->access, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LOOKUP request
 */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_lookup_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_lookup(xdr, args->name, &hdr);
-        encode_lookup(&xdr, args->name, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LOOKUP_ROOT request
 */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_lookup_root_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode REMOVE request
 */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs_removeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_remove(xdr, &args->name, &hdr);
-        encode_remove(&xdr, &args->name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode RENAME request
 */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs_renameargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->old_dir, &hdr);
-        encode_putfh(&xdr, args->old_dir, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_putfh(xdr, args->new_dir, &hdr);
-        encode_putfh(&xdr, args->new_dir, &hdr);
+        encode_rename(xdr, args->old_name, args->new_name, &hdr);
-        encode_rename(&xdr, args->old_name, args->new_name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode LINK request
 */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+                             const struct nfs4_link_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_link(xdr, args->name, &hdr);
-        encode_link(&xdr, args->name, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode CREATE request
 */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_create_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_create(xdr, args, &hdr);
-        encode_create(&xdr, args, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode SYMLINK request
 */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_create_arg *args)
 {
-        return nfs4_xdr_enc_create(req, p, args);
+        nfs4_xdr_enc_create(req, xdr, args);
 }
 /*
 * Encode GETATTR request
 */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_getattr_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a CLOSE request
 */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_closeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_close(xdr, args, &hdr);
-        encode_close(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN request
 */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_openargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_savefh(xdr, &hdr);
-        encode_savefh(&xdr, &hdr);
+        encode_open(xdr, args, &hdr);
-        encode_open(&xdr, args, &hdr);
+        encode_getfh(xdr, &hdr);
-        encode_getfh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_restorefh(xdr, &hdr);
-        encode_restorefh(&xdr, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN_CONFIRM request
 */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs_open_confirmargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open_confirm(xdr, args, &hdr);
-        encode_open_confirm(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN request with no attributes.
 */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_openargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open(xdr, args, &hdr);
-        encode_open(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an OPEN_DOWNGRADE request
 */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+                                        struct xdr_stream *xdr,
+                                        struct nfs_closeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_open_downgrade(xdr, args, &hdr);
-        encode_open_downgrade(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCK request
 */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_lock_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_lock(xdr, args, &hdr);
-        encode_lock(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCKT request
 */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_lockt_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_lockt(xdr, args, &hdr);
-        encode_lockt(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a LOCKU request
 */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_locku_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_locku(xdr, args, &hdr);
-        encode_locku(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+                                           struct xdr_stream *xdr,
+                                        struct nfs_release_lockowner_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_release_lockowner(xdr, &args->lock_owner, &hdr);
-        encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READLINK request
 */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                  const struct nfs4_readlink *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_readlink(xdr, args, req, &hdr);
-        encode_readlink(&xdr, args, req, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
                        args->pgbase, args->pglen);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READDIR request
 */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 const struct nfs4_readdir_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_readdir(xdr, args, req, &hdr);
-        encode_readdir(&xdr, args, req, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
                         args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
                        __func__, hdr.replen << 2, args->pages,
                        args->pgbase, args->count);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a READ request
 */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+                              struct nfs_readargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_read(xdr, args, &hdr);
-        encode_read(&xdr, args, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
                         args->pages, args->pgbase, args->count);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode an SETATTR request
 */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 struct nfs_setattrargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_setattr(xdr, args, args->server, &hdr);
-        encode_setattr(&xdr, args, args->server, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a GETACL request
 */
-static int
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
+                                struct nfs_getaclargs *args)
-                struct nfs_getaclargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        uint32_t replen;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
        replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-        encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+        encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
                args->acl_pages, args->acl_pgbase, args->acl_len);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode a WRITE request
 */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_writeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_write(xdr, args, &hdr);
-        encode_write(&xdr, args, &hdr);
        req->rq_snd_buf.flags |= XDRBUF_WRITE;
-        encode_getfattr(&xdr, args->bitmask, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 *  a COMMIT request
 */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs_writeargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_commit(xdr, args, &hdr);
-        encode_commit(&xdr, args, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * FSINFO request
 */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                struct nfs4_fsinfo_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_fsinfo(xdr, args->bitmask, &hdr);
-        encode_fsinfo(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a PATHCONF request
 */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                  const struct nfs4_pathconf_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
-        encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
                           &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a STATFS request
 */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                const struct nfs4_statfs_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
-        encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
                           args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * GETATTR_BITMAP request
 */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
-                                    struct nfs4_server_caps_arg *args)
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_server_caps_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fhandle, &hdr);
-        encode_putfh(&xdr, args->fhandle, &hdr);
+        encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-        encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
                           FATTR4_WORD0_LINK_SUPPORT|
                           FATTR4_WORD0_SYMLINK_SUPPORT|
                           FATTR4_WORD0_ACLSUPPORT, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a RENEW request
 */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+                               struct nfs_client *clp)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_renew(xdr, clp, &hdr);
-        encode_renew(&xdr, clp, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SETCLIENTID request
 */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_setclientid *sc)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_setclientid(xdr, sc, &hdr);
-        encode_setclientid(&xdr, sc, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SETCLIENTID_CONFIRM request
 */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+                                             struct xdr_stream *xdr,
+                                             struct nfs4_setclientid_res *arg)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .nops   = 0,
        };
        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_setclientid_confirm(xdr, arg, &hdr);
-        encode_setclientid_confirm(&xdr, arg, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_fsinfo(xdr, lease_bitmap, &hdr);
-        encode_fsinfo(&xdr, lease_bitmap, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * DELEGRETURN request
 */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_delegreturnargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fhandle, &hdr);
-        encode_putfh(&xdr, args->fhandle, &hdr);
+        encode_delegreturn(xdr, args->stateid, &hdr);
-        encode_delegreturn(&xdr, args->stateid, &hdr);
+        encode_getfattr(xdr, args->bitmask, &hdr);
-        encode_getfattr(&xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode FS_LOCATIONS request
 */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs4_fs_locations_arg *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
        uint32_t replen;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->dir_fh, &hdr);
-        encode_putfh(&xdr, args->dir_fh, &hdr);
+        encode_lookup(xdr, args->name, &hdr);
-        encode_lookup(&xdr, args->name, &hdr);
        replen = hdr.replen;    /* get the attribute into args->page */
-        encode_fs_locations(&xdr, args->bitmask, &hdr);
+        encode_fs_locations(xdr, args->bitmask, &hdr);
        xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
                        0, PAGE_SIZE);
        encode_nops(&hdr);
-        return 0;
 }
 #if defined(CONFIG_NFS_V4_1)
 /*
 * EXCHANGE_ID request
 */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
-                                    struct nfs41_exchange_id_args *args)
+                                     struct xdr_stream *xdr,
+                                     struct nfs41_exchange_id_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = args->client->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_exchange_id(xdr, args, &hdr);
-        encode_exchange_id(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a CREATE_SESSION request
 */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
-                                       struct nfs41_create_session_args *args)
+                                        struct xdr_stream *xdr,
+                                        struct nfs41_create_session_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = args->client->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_create_session(xdr, args, &hdr);
-        encode_create_session(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a DESTROY_SESSION request
 */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
-                                        struct nfs4_session *session)
+                                         struct xdr_stream *xdr,
+                                         struct nfs4_session *session)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = session->clp->cl_mvops->minor_version,
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_destroy_session(xdr, session, &hdr);
-        encode_destroy_session(&xdr, session, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a SEQUENCE request
 */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
-                                 struct nfs4_sequence_args *args)
+                                  struct nfs4_sequence_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, args, &hdr);
-        encode_sequence(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a GET_LEASE_TIME request
 */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
-                                       struct nfs4_get_lease_time_args *args)
+                                        struct xdr_stream *xdr,
+                                        struct nfs4_get_lease_time_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
        };
        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->la_seq_args, &hdr);
-        encode_sequence(&xdr, &args->la_seq_args, &hdr);
+        encode_putrootfh(xdr, &hdr);
-        encode_putrootfh(&xdr, &hdr);
+        encode_fsinfo(xdr, lease_bitmap, &hdr);
-        encode_fsinfo(&xdr, lease_bitmap, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * a RECLAIM_COMPLETE request
 */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
-                                     struct nfs41_reclaim_complete_args *args)
+                                          struct xdr_stream *xdr,
+                                struct nfs41_reclaim_complete_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args)
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_reclaim_complete(xdr, args, &hdr);
-        encode_reclaim_complete(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 * Encode GETDEVICEINFO request
 */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
-                                      struct nfs4_getdeviceinfo_args *args)
+                                       struct xdr_stream *xdr,
+                                       struct nfs4_getdeviceinfo_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_getdeviceinfo(xdr, args, &hdr);
-        encode_getdeviceinfo(&xdr, args, &hdr);
        /* set up reply kvec. Subtract notification bitmap max size (2)
         * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
                         args->pdev->pglen);
        encode_nops(&hdr);
-        return 0;
 }
 /*
 *  Encode LAYOUTGET request
 */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
-                                  struct nfs4_layoutget_args *args)
+                                   struct xdr_stream *xdr,
+                                   struct nfs4_layoutget_args *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, NFS_FH(args->inode), &hdr);
-        encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+        encode_layoutget(xdr, args, &hdr);
-        encode_layoutget(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
                goto out_overflow;
        eof = be32_to_cpup(p++);
        count = be32_to_cpup(p);
-        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+        hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
@@ -4518,7 +4440,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        xdr_read_pages(xdr, pglen);
-        return 0;
+        return pglen;
 }
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
                goto out_overflow;
        len = be32_to_cpup(p);
        if (len) {
-                int i;
+                uint32_t i;
                p = xdr_inline_decode(xdr, 4 * len);
                if (unlikely(!p))
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
 * Decode OPEN_DOWNGRADE response
 */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
+                                       struct nfs_closeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open_downgrade(&xdr, res);
+        status = decode_open_downgrade(xdr, res);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5118,26 +5040,25 @@ out:
 /*
 * Decode ACCESS response
 */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_accessres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status != 0)
                goto out;
-        status = decode_access(&xdr, res);
+        status = decode_access(xdr, res);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5146,26 +5067,28 @@ out:
 /*
 * Decode LOOKUP response
 */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_lookup_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_lookup(&xdr)) != 0)
+        status = decode_lookup(xdr);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) != 0)
+        status = decode_getfh(xdr, res->fh);
+        if (status)
                goto out;
-        status = decode_getfattr(&xdr, res->fattr, res->server
+        status = decode_getfattr(xdr, res->fattr, res->server
                        ,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5174,23 +5097,25 @@ out:
 /*
 * Decode LOOKUP_ROOT response
 */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_lookup_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putrootfh(&xdr)) != 0)
+        status = decode_putrootfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) == 0)
+        status = decode_getfh(xdr, res->fh);
-                status = decode_getfattr(&xdr, res->fattr, res->server,
+        if (status == 0)
+                status = decode_getfattr(xdr, res->fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5199,24 +5124,25 @@ out:
 /*
 * Decode REMOVE response
 */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_removeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+        status = decode_remove(xdr, &res->cinfo);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->dir_attr, res->server,
+        decode_getfattr(xdr, res->dir_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5225,34 +5151,38 @@ out:
 /*
 * Decode RENAME response
 */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_renameres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+        status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+        if (status)
                goto out;
        /* Current FH is target directory */
-        if (decode_getfattr(&xdr, res->new_fattr, res->server,
+        if (decode_getfattr(xdr, res->new_fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->old_fattr, res->server,
+        decode_getfattr(xdr, res->old_fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5261,37 +5191,41 @@ out:
 /*
 * Decode LINK response
 */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs4_link_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+        status = decode_link(xdr, &res->cinfo);
+        if (status)
                goto out;
        /*
         * Note order: OP_LINK leaves the directory as the current
         *             filehandle.
         */
-        if (decode_getfattr(&xdr, res->dir_attr, res->server,
+        if (decode_getfattr(xdr, res->dir_attr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5300,33 +5234,37 @@ out:
 /*
 * Decode CREATE response
 */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs4_create_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_savefh(&xdr)) != 0)
+        status = decode_savefh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+        status = decode_create(xdr, &res->dir_cinfo);
+        if (status)
                goto out;
-        if ((status = decode_getfh(&xdr, res->fh)) != 0)
+        status = decode_getfh(xdr, res->fh);
+        if (status)
                goto out;
-        if (decode_getfattr(&xdr, res->fattr, res->server,
+        if (decode_getfattr(xdr, res->fattr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if ((status = decode_restorefh(&xdr)) != 0)
+        status = decode_restorefh(xdr);
+        if (status)
                goto out;
-        decode_getfattr(&xdr, res->dir_fattr, res->server,
+        decode_getfattr(xdr, res->dir_fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5335,31 +5273,31 @@ out:
 /*
 * Decode SYMLINK response
 */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_create_res *res)
 {
-        return nfs4_xdr_dec_create(rqstp, p, res);
+        return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 /*
 * Decode GETATTR response
 */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_getattr_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_getfattr(&xdr, res->fattr, res->server,
+        status = decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5368,46 +5306,40 @@ out:
 /*
 * Encode an SETACL request
 */
-static int
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+                                struct nfs_setaclargs *args)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
-        int status;
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(xdr, req, &hdr);
-        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
-        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
-        encode_putfh(&xdr, args->fh, &hdr);
+        encode_setacl(xdr, args, &hdr);
-        status = encode_setacl(&xdr, args, &hdr);
        encode_nops(&hdr);
-        return status;
 }
 /*
 * Decode SETACL response
 */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                    struct nfs_setaclres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr);
+        status = decode_setattr(xdr);
 out:
        return status;
 }
@@ -5416,24 +5348,22 @@ out:
 * Decode GETACL response
 */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                    struct nfs_getaclres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_getacl(&xdr, rqstp, &res->acl_len);
+        status = decode_getacl(xdr, rqstp, &res->acl_len);
 out:
        return status;
@@ -5442,23 +5372,22 @@ out:
 /*
 * Decode CLOSE response
 */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_closeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_close(&xdr, res);
+        status = decode_close(xdr, res);
        if (status != 0)
                goto out;
        /*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
         *      an ESTALE error. Shouldn't be a problem,
         *      though, since fattr->valid will remain unset.
         */
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5476,36 +5405,35 @@ out:
 /*
 * Decode OPEN response
 */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_openres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_savefh(&xdr);
+        status = decode_savefh(xdr);
        if (status)
                goto out;
-        status = decode_open(&xdr, res);
+        status = decode_open(xdr, res);
        if (status)
                goto out;
-        if (decode_getfh(&xdr, &res->fh) != 0)
+        if (decode_getfh(xdr, &res->fh) != 0)
                goto out;
-        if (decode_getfattr(&xdr, res->f_attr, res->server,
+        if (decode_getfattr(xdr, res->f_attr, res->server,
                                !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
                goto out;
-        if (decode_restorefh(&xdr) != 0)
+        if (decode_restorefh(xdr) != 0)
                goto out;
-        decode_getfattr(&xdr, res->dir_attr, res->server,
+        decode_getfattr(xdr, res->dir_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5514,20 +5442,20 @@ out:
 /*
 * Decode OPEN_CONFIRM response
 */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+                                     struct xdr_stream *xdr,
+                                     struct nfs_open_confirmres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open_confirm(&xdr, res);
+        status = decode_open_confirm(xdr, res);
 out:
        return status;
 }
@@ -5535,26 +5463,26 @@ out:
 /*
 * Decode OPEN response
 */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs_openres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_open(&xdr, res);
+        status = decode_open(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->f_attr, res->server,
+        decode_getfattr(xdr, res->f_attr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5563,26 +5491,26 @@ out:
 /*
 * Decode SETATTR response
 */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+                                struct xdr_stream *xdr,
+                                struct nfs_setattrres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_setattr(&xdr);
+        status = decode_setattr(xdr);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5591,23 +5519,22 @@ out:
 /*
 * Decode LOCK response
 */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_lock_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_lock(&xdr, res);
+        status = decode_lock(xdr, res);
 out:
        return status;
 }
@@ -5615,23 +5542,22 @@ out:
 /*
 * Decode LOCKT response
 */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_lockt_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_lockt(&xdr, res);
+        status = decode_lockt(xdr, res);
 out:
        return status;
 }
@@ -5639,61 +5565,58 @@ out:
 /*
 * Decode LOCKU response
 */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_locku_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_locku(&xdr, res);
+        status = decode_locku(xdr, res);
 out:
        return status;
 }
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+                                          struct xdr_stream *xdr, void *dummy)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_release_lockowner(&xdr);
+                status = decode_release_lockowner(xdr);
        return status;
 }
 /*
 * Decode READLINK response
 */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+                                 struct xdr_stream *xdr,
                                 struct nfs4_readlink_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_readlink(&xdr, rqstp);
+        status = decode_readlink(xdr, rqstp);
 out:
        return status;
 }
@@ -5701,23 +5624,22 @@ out:
 /*
 * Decode READDIR response
 */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                                struct nfs4_readdir_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_readdir(&xdr, rqstp, res);
+        status = decode_readdir(xdr, rqstp, res);
 out:
        return status;
 }
@@ -5725,23 +5647,22 @@ out:
 /*
 * Decode Read response
 */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                             struct nfs_readres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_read(&xdr, rqstp, res);
+        status = decode_read(xdr, rqstp, res);
        if (!status)
                status = res->count;
 out:
@@ -5751,26 +5672,25 @@ out:
 /*
 * Decode WRITE response
 */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              struct nfs_writeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_write(&xdr, res);
+        status = decode_write(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
        if (!status)
                status = res->count;
@@ -5781,26 +5701,25 @@ out:
 /*
 * Decode COMMIT response
 */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                               struct nfs_writeres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_commit(&xdr, res);
+        status = decode_commit(xdr, res);
        if (status)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5809,85 +5728,80 @@ out:
 /*
 * Decode FSINFO response
 */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
                               struct nfs4_fsinfo_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, res->fsinfo);
+                status = decode_fsinfo(xdr, res->fsinfo);
        return status;
 }
 /*
 * Decode PATHCONF response
 */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
                                 struct nfs4_pathconf_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_pathconf(&xdr, res->pathconf);
+                status = decode_pathconf(xdr, res->pathconf);
        return status;
 }
 /*
 * Decode STATFS response
 */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
                               struct nfs4_statfs_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, req);
+                status = decode_sequence(xdr, &res->seq_res, req);
        if (!status)
-                status = decode_putfh(&xdr);
+                status = decode_putfh(xdr);
        if (!status)
-                status = decode_statfs(&xdr, res->fsstat);
+                status = decode_statfs(xdr, res->fsstat);
        return status;
 }
 /*
 * Decode GETATTR_BITMAP response
 */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_server_caps_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, req);
+        status = decode_sequence(xdr, &res->seq_res, req);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        status = decode_server_caps(&xdr, res);
+        status = decode_server_caps(xdr, res);
 out:
        return status;
 }
@@ -5895,79 +5809,77 @@ out:
 /*
 * Decode RENEW response
 */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+                              void *__unused)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_renew(&xdr);
+                status = decode_renew(xdr);
        return status;
 }
 /*
 * Decode SETCLIENTID response
 */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
-                struct nfs4_setclientid_res *res)
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_setclientid_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_setclientid(&xdr, res);
+                status = decode_setclientid(xdr, res);
        return status;
 }
 /*
 * Decode SETCLIENTID_CONFIRM response
 */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+                                            struct xdr_stream *xdr,
+                                            struct nfs_fsinfo *fsinfo)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_setclientid_confirm(&xdr);
+                status = decode_setclientid_confirm(xdr);
        if (!status)
-                status = decode_putrootfh(&xdr);
+                status = decode_putrootfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, fsinfo);
+                status = decode_fsinfo(xdr, fsinfo);
        return status;
 }
 /*
 * Decode DELEGRETURN response
 */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_delegreturnres *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status != 0)
                goto out;
-        status = decode_delegreturn(&xdr);
+        status = decode_delegreturn(xdr);
        if (status != 0)
                goto out;
-        decode_getfattr(&xdr, res->fattr, res->server,
+        decode_getfattr(xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));
 out:
        return status;
@@ -5976,26 +5888,27 @@ out:
 /*
 * Decode FS_LOCATIONS response
 */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
                                     struct nfs4_fs_locations_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, req);
+        status = decode_sequence(xdr, &res->seq_res, req);
        if (status)
                goto out;
-        if ((status = decode_putfh(&xdr)) != 0)
+        status = decode_putfh(xdr);
+        if (status)
                goto out;
-        if ((status = decode_lookup(&xdr)) != 0)
+        status = decode_lookup(xdr);
+        if (status)
                goto out;
-        xdr_enter_page(&xdr, PAGE_SIZE);
+        xdr_enter_page(xdr, PAGE_SIZE);
-        status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+        status = decode_getfattr(xdr, &res->fs_locations->fattr,
                                 res->fs_locations->server,
                                 !RPC_IS_ASYNC(req->rq_task));
 out:
@@ -6006,129 +5919,122 @@ out:
 /*
 * Decode EXCHANGE_ID response
 */
-static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
                                    void *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_exchange_id(&xdr, res);
+                status = decode_exchange_id(xdr, res);
        return status;
 }
 /*
 * Decode CREATE_SESSION response
 */
-static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
                                       struct nfs41_create_session_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_create_session(&xdr, res);
+                status = decode_create_session(xdr, res);
        return status;
 }
 /*
 * Decode DESTROY_SESSION response
 */
-static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
-                                        void *dummy)
+                                        struct xdr_stream *xdr,
+                                        void *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_destroy_session(&xdr, dummy);
+                status = decode_destroy_session(xdr, res);
        return status;
 }
 /*
 * Decode SEQUENCE response
 */
-static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+                                 struct xdr_stream *xdr,
                                 struct nfs4_sequence_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, res, rqstp);
+                status = decode_sequence(xdr, res, rqstp);
        return status;
 }
 /*
 * Decode GET_LEASE_TIME response
 */
-static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+                                       struct xdr_stream *xdr,
                                       struct nfs4_get_lease_time_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+                status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
        if (!status)
-                status = decode_putrootfh(&xdr);
+                status = decode_putrootfh(xdr);
        if (!status)
-                status = decode_fsinfo(&xdr, res->lr_fsinfo);
+                status = decode_fsinfo(xdr, res->lr_fsinfo);
        return status;
 }
 /*
 * Decode RECLAIM_COMPLETE response
 */
-static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+                                         struct xdr_stream *xdr,
                                         struct nfs41_reclaim_complete_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (!status)
-                status = decode_sequence(&xdr, &res->seq_res, rqstp);
+                status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (!status)
-                status = decode_reclaim_complete(&xdr, (void *)NULL);
+                status = decode_reclaim_complete(xdr, (void *)NULL);
        return status;
 }
 /*
 * Decode GETDEVINFO response
 */
-static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+                                      struct xdr_stream *xdr,
                                      struct nfs4_getdeviceinfo_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status != 0)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status != 0)
                goto out;
-        status = decode_getdeviceinfo(&xdr, res->pdev);
+        status = decode_getdeviceinfo(xdr, res->pdev);
 out:
        return status;
 }
@@ -6136,45 +6042,58 @@ out:
 /*
 * Decode LAYOUTGET response
 */
-static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
+                                  struct xdr_stream *xdr,
                                  struct nfs4_layoutget_res *res)
 {
-        struct xdr_stream xdr;
        struct compound_hdr hdr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(xdr, &hdr);
-        status = decode_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (status)
                goto out;
-        status = decode_putfh(&xdr);
+        status = decode_putfh(xdr);
        if (status)
                goto out;
-        status = decode_layoutget(&xdr, rqstp, res);
+        status = decode_layoutget(xdr, rqstp, res);
 out:
        return status;
 }
 #endif /* CONFIG_NFS_V4_1 */
-__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+/**
-                           struct nfs_server *server, int plus)
+ * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ */
+int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                       int plus)
 {
        uint32_t bitmap[2] = {0};
        uint32_t len;
        __be32 *p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                goto out_overflow;
-        if (!ntohl(*p++)) {
+        if (*p == xdr_zero) {
                p = xdr_inline_decode(xdr, 4);
                if (unlikely(!p))
                        goto out_overflow;
-                if (!ntohl(*p++))
+                if (*p == xdr_zero)
-                        return ERR_PTR(-EAGAIN);
+                        return -EAGAIN;
                entry->eof = 1;
-                return ERR_PTR(-EBADCOOKIE);
+                return -EBADCOOKIE;
        }
        p = xdr_inline_decode(xdr, 12);
@@ -6182,7 +6101,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
                goto out_overflow;
        entry->prev_cookie = entry->cookie;
        p = xdr_decode_hyper(p, &entry->cookie);
-        entry->len = ntohl(*p++);
+        entry->len = be32_to_cpup(p);
        p = xdr_inline_decode(xdr, entry->len);
        if (unlikely(!p))
@@ -6203,25 +6122,21 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
        if (decode_attr_length(xdr, &len, &p) < 0)
                goto out_overflow;
-        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
+                                        entry->server, 1) < 0)
                goto out_overflow;
        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
                entry->ino = entry->fattr->fileid;
-        if (verify_attr_len(xdr, p, len) < 0)
+        entry->d_type = DT_UNKNOWN;
-                goto out_overflow;
+        if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+                entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
-        p = xdr_inline_peek(xdr, 8);
-        if (p != NULL)
-                entry->eof = !p[0] && p[1];
-        else
-                entry->eof = 0;
-        return p;
+        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return -EAGAIN;
 }
 /*
@@ -6297,8 +6212,8 @@ nfs4_stat_to_errno(int stat)
 #define PROC(proc, argtype, restype)                            \
 [NFSPROC4_CLNT_##proc] = {                                      \
        .p_proc   = NFSPROC4_COMPOUND,                          \
-        .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,            \
+        .p_encode = (kxdreproc_t)nfs4_xdr_##argtype,            \
-        .p_decode = (kxdrproc_t) nfs4_xdr_##restype,            \
+        .p_decode = (kxdrdproc_t)nfs4_xdr_##restype,            \
        .p_arglen = NFS4_##argtype##_sz,                        \
        .p_replen = NFS4_##restype##_sz,                        \
        .p_statidx = NFSPROC4_CLNT_##proc,                      \
@@ -6306,50 +6221,50 @@ nfs4_stat_to_errno(int stat)
 }
 struct rpc_procinfo     nfs4_procedures[] = {
-  PROC(READ,            enc_read,       dec_read),
+        PROC(READ,              enc_read,               dec_read),
-  PROC(WRITE,           enc_write,      dec_write),
+        PROC(WRITE,             enc_write,              dec_write),
-  PROC(COMMIT,          enc_commit,     dec_commit),
+        PROC(COMMIT,            enc_commit,             dec_commit),
-  PROC(OPEN,            enc_open,       dec_open),
+        PROC(OPEN,              enc_open,               dec_open),
-  PROC(OPEN_CONFIRM,    enc_open_confirm,       dec_open_confirm),
+        PROC(OPEN_CONFIRM,      enc_open_confirm,       dec_open_confirm),
-  PROC(OPEN_NOATTR,     enc_open_noattr,        dec_open_noattr),
+        PROC(OPEN_NOATTR,       enc_open_noattr,        dec_open_noattr),
-  PROC(OPEN_DOWNGRADE,  enc_open_downgrade,     dec_open_downgrade),
+        PROC(OPEN_DOWNGRADE,    enc_open_downgrade,     dec_open_downgrade),
-  PROC(CLOSE,           enc_close,      dec_close),
+        PROC(CLOSE,             enc_close,              dec_close),
-  PROC(SETATTR,         enc_setattr,    dec_setattr),
+        PROC(SETATTR,           enc_setattr,            dec_setattr),
-  PROC(FSINFO,          enc_fsinfo,     dec_fsinfo),
+        PROC(FSINFO,            enc_fsinfo,             dec_fsinfo),
-  PROC(RENEW,           enc_renew,      dec_renew),
+        PROC(RENEW,             enc_renew,              dec_renew),
-  PROC(SETCLIENTID,     enc_setclientid,        dec_setclientid),
+        PROC(SETCLIENTID,       enc_setclientid,        dec_setclientid),
-  PROC(SETCLIENTID_CONFIRM,     enc_setclientid_confirm,        dec_setclientid_confirm),
+        PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
-  PROC(LOCK,            enc_lock,       dec_lock),
+        PROC(LOCK,              enc_lock,               dec_lock),
-  PROC(LOCKT,           enc_lockt,      dec_lockt),
+        PROC(LOCKT,             enc_lockt,              dec_lockt),
-  PROC(LOCKU,           enc_locku,      dec_locku),
+        PROC(LOCKU,             enc_locku,              dec_locku),
-  PROC(ACCESS,          enc_access,     dec_access),
+        PROC(ACCESS,            enc_access,             dec_access),
-  PROC(GETATTR,         enc_getattr,    dec_getattr),
+        PROC(GETATTR,           enc_getattr,            dec_getattr),
-  PROC(LOOKUP,          enc_lookup,     dec_lookup),
+        PROC(LOOKUP,            enc_lookup,             dec_lookup),
-  PROC(LOOKUP_ROOT,     enc_lookup_root,        dec_lookup_root),
+        PROC(LOOKUP_ROOT,       enc_lookup_root,        dec_lookup_root),
-  PROC(REMOVE,          enc_remove,     dec_remove),
+        PROC(REMOVE,            enc_remove,             dec_remove),
-  PROC(RENAME,          enc_rename,     dec_rename),
+        PROC(RENAME,            enc_rename,             dec_rename),
-  PROC(LINK,            enc_link,       dec_link),
+        PROC(LINK,              enc_link,               dec_link),
-  PROC(SYMLINK,         enc_symlink,    dec_symlink),
+        PROC(SYMLINK,           enc_symlink,            dec_symlink),
-  PROC(CREATE,          enc_create,     dec_create),
+        PROC(CREATE,            enc_create,             dec_create),
-  PROC(PATHCONF,        enc_pathconf,   dec_pathconf),
+        PROC(PATHCONF,          enc_pathconf,           dec_pathconf),
-  PROC(STATFS,          enc_statfs,     dec_statfs),
+        PROC(STATFS,            enc_statfs,             dec_statfs),
-  PROC(READLINK,        enc_readlink,   dec_readlink),
+        PROC(READLINK,          enc_readlink,           dec_readlink),
-  PROC(READDIR,         enc_readdir,    dec_readdir),
+        PROC(READDIR,           enc_readdir,            dec_readdir),
-  PROC(SERVER_CAPS,     enc_server_caps, dec_server_caps),
+        PROC(SERVER_CAPS,       enc_server_caps,        dec_server_caps),
-  PROC(DELEGRETURN,     enc_delegreturn, dec_delegreturn),
+        PROC(DELEGRETURN,       enc_delegreturn,        dec_delegreturn),
-  PROC(GETACL,          enc_getacl,     dec_getacl),
+        PROC(GETACL,            enc_getacl,             dec_getacl),
-  PROC(SETACL,          enc_setacl,     dec_setacl),
+        PROC(SETACL,            enc_setacl,             dec_setacl),
-  PROC(FS_LOCATIONS,    enc_fs_locations, dec_fs_locations),
+        PROC(FS_LOCATIONS,      enc_fs_locations,       dec_fs_locations),
-  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
+        PROC(RELEASE_LOCKOWNER, enc_release_lockowner,  dec_release_lockowner),
 #if defined(CONFIG_NFS_V4_1)
-  PROC(EXCHANGE_ID,     enc_exchange_id,        dec_exchange_id),
+        PROC(EXCHANGE_ID,       enc_exchange_id,        dec_exchange_id),
-  PROC(CREATE_SESSION,  enc_create_session,     dec_create_session),
+        PROC(CREATE_SESSION,    enc_create_session,     dec_create_session),
-  PROC(DESTROY_SESSION, enc_destroy_session,    dec_destroy_session),
+        PROC(DESTROY_SESSION,   enc_destroy_session,    dec_destroy_session),
-  PROC(SEQUENCE,        enc_sequence,   dec_sequence),
+        PROC(SEQUENCE,          enc_sequence,           dec_sequence),
-  PROC(GET_LEASE_TIME,  enc_get_lease_time,     dec_get_lease_time),
+        PROC(GET_LEASE_TIME,    enc_get_lease_time,     dec_get_lease_time),
-  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+        PROC(RECLAIM_COMPLETE,  enc_reclaim_complete,   dec_reclaim_complete),
-  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+        PROC(GETDEVICEINFO,     enc_getdeviceinfo,      dec_getdeviceinfo),
-  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
+        PROC(LAYOUTGET,         enc_layoutget,          dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63d..e1164e3f9e6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
-        struct nfs_page *p;
+        struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
-        p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
+        if (p)
-        if (p) {
-                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->wb_list);
-        }
        return p;
 }
@@ -115,7 +112,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 {
        if (!nfs_lock_request_dontget(req))
                return 0;
-        if (req->wb_page != NULL)
+        if (test_bit(PG_MAPPED, &req->wb_flags))
                radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 1;
 }
@@ -125,7 +122,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-        if (req->wb_page != NULL) {
+        if (test_bit(PG_MAPPED, &req->wb_flags)) {
                struct inode *inode = req->wb_context->path.dentry->d_inode;
                struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95..1b1bc1a0fb0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 * pNFS client layout cache
 */
+/* Need to hold i_lock if caller does not already hold reference */
+void
+get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        atomic_inc(&lo->plh_refcount);
+}
 static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-        assert_spin_locked(&lo->inode->i_lock);
+        dprintk("%s: freeing layout cache %p\n", __func__, lo);
-        lo->refcount++;
+        BUG_ON(!list_empty(&lo->plh_layouts));
+        NFS_I(lo->plh_inode)->layout = NULL;
+        kfree(lo);
 }
 static void
 put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 {
-        assert_spin_locked(&lo->inode->i_lock);
+        if (atomic_dec_and_test(&lo->plh_refcount))
-        BUG_ON(lo->refcount == 0);
+                destroy_layout_hdr(lo);
-        lo->refcount--;
-        if (!lo->refcount) {
-                dprintk("%s: freeing layout cache %p\n", __func__, lo);
-                BUG_ON(!list_empty(&lo->layouts));
-                NFS_I(lo->inode)->layout = NULL;
-                kfree(lo);
-        }
 }
 void
-put_layout_hdr(struct inode *inode)
+put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-        spin_lock(&inode->i_lock);
+        struct inode *inode = lo->plh_inode;
-        put_layout_hdr_locked(NFS_I(inode)->layout);
-        spin_unlock(&inode->i_lock);
+        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+                destroy_layout_hdr(lo);
+                spin_unlock(&inode->i_lock);
+        }
 }
 static void
 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 {
-        INIT_LIST_HEAD(&lseg->fi_list);
+        INIT_LIST_HEAD(&lseg->pls_list);
-        kref_init(&lseg->kref);
+        atomic_set(&lseg->pls_refcount, 1);
-        lseg->layout = lo;
+        smp_mb();
+        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+        lseg->pls_layout = lo;
 }
-/* Called without i_lock held, as the free_lseg call may sleep */
+static void free_lseg(struct pnfs_layout_segment *lseg)
-static void
-destroy_lseg(struct kref *kref)
 {
-        struct pnfs_layout_segment *lseg =
+        struct inode *ino = lseg->pls_layout->plh_inode;
-                container_of(kref, struct pnfs_layout_segment, kref);
-        struct inode *ino = lseg->layout->inode;
-        dprintk("--> %s\n", __func__);
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-        /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+        /* Matched by get_layout_hdr in pnfs_insert_layout */
-        put_layout_hdr(ino);
+        put_layout_hdr(NFS_I(ino)->layout);
 }
-static void
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
-put_lseg(struct pnfs_layout_segment *lseg)
+ * could sleep, so must be called outside of the lock.
+ * Returns 1 if object was removed, otherwise return 0.
+ */
+static int
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+                struct list_head *tmp_list)
+{
+        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+                atomic_read(&lseg->pls_refcount),
+                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+        if (atomic_dec_and_test(&lseg->pls_refcount)) {
+                struct inode *ino = lseg->pls_layout->plh_inode;
+                BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+                list_del(&lseg->pls_list);
+                if (list_empty(&lseg->pls_layout->plh_segs)) {
+                        struct nfs_client *clp;
+                        clp = NFS_SERVER(ino)->nfs_client;
+                        spin_lock(&clp->cl_lock);
+                        /* List does not take a reference, so no need for put here */
+                        list_del_init(&lseg->pls_layout->plh_layouts);
+                        spin_unlock(&clp->cl_lock);
+                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+                }
+                rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+                list_add(&lseg->pls_list, tmp_list);
+                return 1;
+        }
+        return 0;
+}
+static bool
+should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
 {
-        if (!lseg)
+        return (recall_iomode == IOMODE_ANY ||
-                return;
+                lseg_iomode == recall_iomode);
+}
-        dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+/* Returns 1 if lseg is removed from list, 0 otherwise */
-                atomic_read(&lseg->kref.refcount));
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
-        kref_put(&lseg->kref, destroy_lseg);
+                             struct list_head *tmp_list)
+{
+        int rv = 0;
+        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+                /* Remove the reference keeping the lseg in the
+                 * list.  It will now be removed when all
+                 * outstanding io is finished.
+                 */
+                rv = put_lseg_locked(lseg, tmp_list);
+        }
+        return rv;
 }
-static void
+/* Returns the number of matching invalid lsegs remaining in list
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+ * after call.
+ */
+int
+mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+                            struct list_head *tmp_list,
+                            u32 iomode)
 {
        struct pnfs_layout_segment *lseg, *next;
-        struct nfs_client *clp;
+        int invalid = 0, removed = 0;
        dprintk("%s:Begin lo %p\n", __func__, lo);
-        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-        list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+                if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
-                dprintk("%s: freeing lseg %p\n", __func__, lseg);
+                        dprintk("%s: freeing lseg %p iomode %d "
-                list_move(&lseg->fi_list, tmp_list);
+                                "offset %llu length %llu\n", __func__,
-        }
+                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
-        clp = NFS_SERVER(lo->inode)->nfs_client;
+                                lseg->pls_range.length);
-        spin_lock(&clp->cl_lock);
+                        invalid++;
-        /* List does not take a reference, so no need for put here */
+                        removed += mark_lseg_invalid(lseg, tmp_list);
-        list_del_init(&lo->layouts);
+                }
-        spin_unlock(&clp->cl_lock);
+        dprintk("%s:Return %i\n", __func__, invalid - removed);
-        write_seqlock(&lo->seqlock);
+        return invalid - removed;
-        clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-        write_sequnlock(&lo->seqlock);
-        dprintk("%s:Return\n", __func__);
 }
-static void
+void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+pnfs_free_lseg_list(struct list_head *free_me)
 {
-        struct pnfs_layout_segment *lseg;
+        struct pnfs_layout_segment *lseg, *tmp;
-        while (!list_empty(tmp_list)) {
+        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
-                lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+                list_del(&lseg->pls_list);
-                                fi_list);
+                free_lseg(lseg);
-                dprintk("%s calling put_lseg on %p\n", __func__, lseg);
-                list_del(&lseg->fi_list);
-                put_lseg(lseg);
        }
 }
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
-                pnfs_clear_lseg_list(lo, &tmp_list);
+                set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+                mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
                /* Matched by refcount set to 1 in alloc_init_layout_hdr */
                put_layout_hdr_locked(lo);
        }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        while (!list_empty(&tmp_list)) {
                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-                                layouts);
+                                plh_layouts);
                dprintk("%s freeing layout for inode %lu\n", __func__,
-                        lo->inode->i_ino);
+                        lo->plh_inode->i_ino);
-                pnfs_destroy_layout(NFS_I(lo->inode));
+                pnfs_destroy_layout(NFS_I(lo->plh_inode));
        }
 }
-/* update lo->stateid with new if is more recent
+/* update lo->plh_stateid with new if it is more recent */
- *
+void
- * lo->stateid could be the open stateid, in which case we just use what given.
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
- */
+                        bool update_barrier)
-static void
+{
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+        u32 oldseq, newseq;
-                        const nfs4_stateid *new)
-{
+        oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
-        nfs4_stateid *old = &lo->stateid;
+        newseq = be32_to_cpu(new->stateid.seqid);
-        bool overwrite = false;
+        if ((int)(newseq - oldseq) > 0) {
+                memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
-        write_seqlock(&lo->seqlock);
+                if (update_barrier) {
-        if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+                        u32 new_barrier = be32_to_cpu(new->stateid.seqid);
-            memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
-                overwrite = true;
+                        if ((int)(new_barrier - lo->plh_barrier))
-        else {
+                                lo->plh_barrier = new_barrier;
-                u32 oldseq, newseq;
+                } else {
+                        /* Because of wraparound, we want to keep the barrier
-                oldseq = be32_to_cpu(old->stateid.seqid);
+                         * "close" to the current seqids.  It needs to be
-                newseq = be32_to_cpu(new->stateid.seqid);
+                         * within 2**31 to count as "behind", so if it
-                if ((int)(newseq - oldseq) > 0)
+                         * gets too near that limit, give us a little leeway
-                        overwrite = true;
+                         * and bring it to within 2**30.
+                         * NOTE - and yes, this is all unsigned arithmetic.
+                         */
+                        if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+                                lo->plh_barrier = newseq - (1 << 30);
+                }
        }
-        if (overwrite)
-                memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
-        write_sequnlock(&lo->seqlock);
 }
-static void
+/* lget is set to 1 if called from inside send_layoutget call chain */
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+static bool
-                              struct nfs4_state *state)
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+                        int lget)
 {
-        int seq;
+        if ((stateid) &&
+            (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
-        dprintk("--> %s\n", __func__);
+                return true;
-        write_seqlock(&lo->seqlock);
+        return lo->plh_block_lgets ||
-        do {
+                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-                seq = read_seqbegin(&state->seqlock);
+                (list_empty(&lo->plh_segs) &&
-                memcpy(lo->stateid.data, state->stateid.data,
+                 (atomic_read(&lo->plh_outstanding) > lget));
-                       sizeof(state->stateid.data));
-        } while (read_seqretry(&state->seqlock, seq));
-        set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-        write_sequnlock(&lo->seqlock);
-        dprintk("<-- %s\n", __func__);
 }
-void
+int
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-                        struct nfs4_state *open_state)
+                              struct nfs4_state *open_state)
 {
-        int seq;
+        int status = 0;
        dprintk("--> %s\n", __func__);
-        do {
+        spin_lock(&lo->plh_inode->i_lock);
-                seq = read_seqbegin(&lo->seqlock);
+        if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
-                if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+                status = -EAGAIN;
-                        /* This will trigger retry of the read */
+        } else if (list_empty(&lo->plh_segs)) {
-                        pnfs_layout_from_open_stateid(lo, open_state);
+                int seq;
-                } else
-                        memcpy(dst->data, lo->stateid.data,
+                do {
-                               sizeof(lo->stateid.data));
+                        seq = read_seqbegin(&open_state->seqlock);
-        } while (read_seqretry(&lo->seqlock, seq));
+                        memcpy(dst->data, open_state->stateid.data,
+                               sizeof(open_state->stateid.data));
+                } while (read_seqretry(&open_state->seqlock, seq));
+        } else
+                memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
+        return status;
 }
 /*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           u32 iomode)
 {
-        struct inode *ino = lo->inode;
+        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        BUG_ON(ctx == NULL);
        lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
-        if (lgp == NULL) {
+        if (lgp == NULL)
-                put_layout_hdr(lo->inode);
                return NULL;
-        }
        lgp->args.minlength = NFS4_MAX_UINT64;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        nfs4_proc_layoutget(lgp);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
-                set_bit(lo_fail_bit(iomode), &lo->state);
+                set_bit(lo_fail_bit(iomode), &lo->plh_flags);
        }
        return lseg;
 }
+bool pnfs_roc(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        struct pnfs_layout_segment *lseg, *tmp;
+        LIST_HEAD(tmp_list);
+        bool found = false;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+                goto out_nolayout;
+        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+                        mark_lseg_invalid(lseg, &tmp_list);
+                        found = true;
+                }
+        if (!found)
+                goto out_nolayout;
+        lo->plh_block_lgets++;
+        get_layout_hdr(lo); /* matched in pnfs_roc_release */
+        spin_unlock(&ino->i_lock);
+        pnfs_free_lseg_list(&tmp_list);
+        return true;
+out_nolayout:
+        spin_unlock(&ino->i_lock);
+        return false;
+}
+void pnfs_roc_release(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        lo->plh_block_lgets--;
+        put_layout_hdr_locked(lo);
+        spin_unlock(&ino->i_lock);
+}
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+        struct pnfs_layout_hdr *lo;
+        spin_lock(&ino->i_lock);
+        lo = NFS_I(ino)->layout;
+        if ((int)(barrier - lo->plh_barrier) > 0)
+                lo->plh_barrier = barrier;
+        spin_unlock(&ino->i_lock);
+}
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_segment *lseg;
+        bool found = false;
+        spin_lock(&ino->i_lock);
+        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+                        found = true;
+                        break;
+                }
+        if (!found) {
+                struct pnfs_layout_hdr *lo = nfsi->layout;
+                u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+                /* Since close does not return a layout stateid for use as
+                 * a barrier, we choose the worst-case barrier.
+                 */
+                *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+        }
+        spin_unlock(&ino->i_lock);
+        return found;
+}
 /*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
        dprintk("%s:Begin\n", __func__);
-        assert_spin_locked(&lo->inode->i_lock);
+        assert_spin_locked(&lo->plh_inode->i_lock);
-        if (list_empty(&lo->segs)) {
+        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-                struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+                if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
-                spin_lock(&clp->cl_lock);
-                BUG_ON(!list_empty(&lo->layouts));
-                list_add_tail(&lo->layouts, &clp->cl_layouts);
-                spin_unlock(&clp->cl_lock);
-        }
-        list_for_each_entry(lp, &lo->segs, fi_list) {
-                if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
                        continue;
-                list_add_tail(&lseg->fi_list, &lp->fi_list);
+                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
-                        __func__, lseg, lseg->range.iomode,
+                        __func__, lseg, lseg->pls_range.iomode,
-                        lseg->range.offset, lseg->range.length,
+                        lseg->pls_range.offset, lseg->pls_range.length,
-                        lp, lp->range.iomode, lp->range.offset,
+                        lp, lp->pls_range.iomode, lp->pls_range.offset,
-                        lp->range.length);
+                        lp->pls_range.length);
                found = 1;
                break;
        }
        if (!found) {
-                list_add_tail(&lseg->fi_list, &lo->segs);
+                list_add_tail(&lseg->pls_list, &lo->plh_segs);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu at tail\n",
-                        __func__, lseg, lseg->range.iomode,
+                        __func__, lseg, lseg->pls_range.iomode,
-                        lseg->range.offset, lseg->range.length);
+                        lseg->pls_range.offset, lseg->pls_range.length);
        }
-        get_layout_hdr_locked(lo);
+        get_layout_hdr(lo);
        dprintk("%s:Return\n", __func__);
 }
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
        lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
        if (!lo)
                return NULL;
-        lo->refcount = 1;
+        atomic_set(&lo->plh_refcount, 1);
-        INIT_LIST_HEAD(&lo->layouts);
+        INIT_LIST_HEAD(&lo->plh_layouts);
-        INIT_LIST_HEAD(&lo->segs);
+        INIT_LIST_HEAD(&lo->plh_segs);
-        seqlock_init(&lo->seqlock);
+        INIT_LIST_HEAD(&lo->plh_bulk_recall);
-        lo->inode = ino;
+        lo->plh_inode = ino;
        return lo;
 }
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
        assert_spin_locked(&ino->i_lock);
-        if (nfsi->layout)
+        if (nfsi->layout) {
-                return nfsi->layout;
+                if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+                        return NULL;
+                else
+                        return nfsi->layout;
+        }
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino);
        spin_lock(&ino->i_lock);
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
 static int
 is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
 {
-        return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+        return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
 }
 /*
 * lookup range in layout
 */
 static struct pnfs_layout_segment *
-pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
        dprintk("%s:Begin\n", __func__);
-        assert_spin_locked(&lo->inode->i_lock);
+        assert_spin_locked(&lo->plh_inode->i_lock);
-        list_for_each_entry(lseg, &lo->segs, fi_list) {
+        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
-                if (is_matching_lseg(lseg, iomode)) {
+                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+                    is_matching_lseg(lseg, iomode)) {
                        ret = lseg;
                        break;
                }
-                if (cmp_layout(iomode, lseg->range.iomode) > 0)
+                if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
                        break;
        }
        dprintk("%s:Return lseg %p ref %d\n",
-                __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
 }
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
                   enum pnfs_iomode iomode)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
+        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg = NULL;
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
                goto out_unlock;
        }
-        /* Check to see if the layout for the given range already exists */
+        /* Do we even need to bother with this? */
-        lseg = pnfs_has_layout(lo, iomode);
+        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-        if (lseg) {
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-                dprintk("%s: Using cached lseg %p for iomode %d)\n",
+                dprintk("%s matches recall, use MDS\n", __func__);
-                        __func__, lseg, iomode);
                goto out_unlock;
        }
+        /* Check to see if the layout for the given range already exists */
+        lseg = pnfs_find_lseg(lo, iomode);
+        if (lseg)
+                goto out_unlock;
        /* if LAYOUTGET already failed once we don't try again */
-        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+                goto out_unlock;
+        if (pnfs_layoutgets_blocked(lo, NULL, 0))
                goto out_unlock;
+        atomic_inc(&lo->plh_outstanding);
-        get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+        get_layout_hdr(lo);
+        if (list_empty(&lo->plh_segs)) {
+                /* The lo must be on the clp list if there is any
+                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+                 */
+                spin_lock(&clp->cl_lock);
+                BUG_ON(!list_empty(&lo->plh_layouts));
+                list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
+                spin_unlock(&clp->cl_lock);
+        }
        spin_unlock(&ino->i_lock);
        lseg = send_layoutget(lo, ctx, iomode);
+        if (!lseg) {
+                spin_lock(&ino->i_lock);
+                if (list_empty(&lo->plh_segs)) {
+                        spin_lock(&clp->cl_lock);
+                        list_del_init(&lo->plh_layouts);
+                        spin_unlock(&clp->cl_lock);
+                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+                }
+                spin_unlock(&ino->i_lock);
+        }
+        atomic_dec(&lo->plh_outstanding);
+        put_layout_hdr(lo);
 out:
        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-                nfsi->layout->state, lseg);
+                nfsi->layout->plh_flags, lseg);
        return lseg;
 out_unlock:
        spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
        struct nfs4_layoutget_res *res = &lgp->res;
        struct pnfs_layout_segment *lseg;
-        struct inode *ino = lo->inode;
+        struct inode *ino = lo->plh_inode;
+        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        int status = 0;
+        /* Verify we got what we asked for.
+         * Note that because the xdr parsing only accepts a single
+         * element array, this can fail even if the server is behaving
+         * correctly.
+         */
+        if (lgp->args.range.iomode > res->range.iomode ||
+            res->range.offset != 0 ||
+            res->range.length != NFS4_MAX_UINT64) {
+                status = -EINVAL;
+                goto out;
+        }
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
        if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        }
        spin_lock(&ino->i_lock);
+        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+                dprintk("%s forget reply due to recall\n", __func__);
+                goto out_forget_reply;
+        }
+        if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+                dprintk("%s forget reply due to state\n", __func__);
+                goto out_forget_reply;
+        }
        init_lseg(lo, lseg);
-        lseg->range = res->range;
+        lseg->pls_range = res->range;
        *lgp->lsegpp = lseg;
        pnfs_insert_layout(lo, lseg);
+        if (res->return_on_close) {
+                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+        }
        /* Done processing layoutget. Set the layout stateid */
-        pnfs_set_layout_stateid(lo, &res->stateid);
+        pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
 out:
        return status;
+out_forget_reply:
+        spin_unlock(&ino->i_lock);
+        lseg->pls_layout = lo;
+        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+        goto out;
 }
 /*
@@ -769,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp)
 {
        struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
-        dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+        dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
        if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
                int i;
                /* Verify cache is empty */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d5048..e2612ea0cbe 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
+enum {
+        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
+        NFS_LSEG_ROC,           /* roc bit received from server */
+};
 struct pnfs_layout_segment {
-        struct list_head fi_list;
+        struct list_head pls_list;
-        struct pnfs_layout_range range;
+        struct pnfs_layout_range pls_range;
-        struct kref kref;
+        atomic_t pls_refcount;
-        struct pnfs_layout_hdr *layout;
+        unsigned long pls_flags;
+        struct pnfs_layout_hdr *pls_layout;
 };
 #ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
 enum {
        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
-        NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
+        NFS_LAYOUT_BULK_RECALL,         /* bulk recall affecting layout */
+        NFS_LAYOUT_ROC,                 /* some lseg had roc bit set */
+        NFS_LAYOUT_DESTROYED,           /* no new use of layout allowed */
 };
 /* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
 };
 struct pnfs_layout_hdr {
-        unsigned long           refcount;
+        atomic_t                plh_refcount;
-        struct list_head        layouts;   /* other client layouts */
+        struct list_head        plh_layouts;   /* other client layouts */
-        struct list_head        segs;      /* layout segments list */
+        struct list_head        plh_bulk_recall; /* clnt list of bulk recalls */
-        seqlock_t               seqlock;   /* Protects the stateid */
+        struct list_head        plh_segs;      /* layout segments list */
-        nfs4_stateid            stateid;
+        nfs4_stateid            plh_stateid;
-        unsigned long           state;
+        atomic_t                plh_outstanding; /* number of RPCs out */
-        struct inode            *inode;
+        unsigned long           plh_block_lgets; /* block LAYOUTGET if >0 */
+        u32                     plh_barrier; /* ignore lower seqids */
+        unsigned long           plh_flags;
+        struct inode            *plh_inode;
 };
 struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
 /* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
                   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct inode *inode);
+void put_layout_hdr(struct pnfs_layout_hdr *lo);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-                             struct nfs4_state *open_state);
+                             const nfs4_stateid *new,
+                             bool update_barrier);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+                                  struct pnfs_layout_hdr *lo,
+                                  struct nfs4_state *open_state);
+int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+                                struct list_head *tmp_list,
+                                u32 iomode);
+bool pnfs_roc(struct inode *ino);
+void pnfs_roc_release(struct inode *ino);
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
        return NULL;
 }
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+        return false;
+}
+static inline void
+pnfs_roc_release(struct inode *ino)
+{
+}
+static inline void
+pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+}
+static inline bool
+pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+        return false;
+}
 static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
 {
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1f..77d5e21c4ad 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
        fattr = nfs_alloc_fattr();
        status = -ENOMEM;
        if (fh == NULL || fattr == NULL)
-                goto out;
+                goto out_free;
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
        if (status == 0)
                status = nfs_instantiate(dentry, fh, fattr);
+out_free:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fh);
 out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .statfs         = nfs_proc_statfs,
        .fsinfo         = nfs_proc_fsinfo,
        .pathconf       = nfs_proc_pathconf,
-        .decode_dirent  = nfs_decode_dirent,
+        .decode_dirent  = nfs2_decode_dirent,
        .read_setup     = nfs_proc_read_setup,
        .read_done      = nfs_read_done,
        .write_setup    = nfs_proc_write_setup,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6..aedcaa7f291 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
                        (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
-        nfs_clear_request(req);
        nfs_release_request(req);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adc..b68c8607770 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
 #define NFSDBG_FACILITY         NFSDBG_VFS
+#ifdef CONFIG_NFS_V3
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
 enum {
        /* Mount options that take no arguments */
        Opt_soft, Opt_hard,
@@ -593,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
        if (nfss->mountd_version || showdefaults)
                seq_printf(m, ",mountvers=%u", nfss->mountd_version);
-        if (nfss->mountd_port || showdefaults)
+        if ((nfss->mountd_port &&
+                nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+                showdefaults)
                seq_printf(m, ",mountport=%u", nfss->mountd_port);
        nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -1064,12 +1071,10 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->flags |= NFS_MOUNT_VER3;
                        mnt->version = 3;
                        break;
-#ifdef CONFIG_NFS_V4
                case Opt_v4:
                        mnt->flags &= ~NFS_MOUNT_VER3;
                        mnt->version = 4;
                        break;
-#endif
                case Opt_udp:
                        mnt->flags &= ~NFS_MOUNT_TCP;
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1286,10 @@ static int nfs_parse_mount_options(char *raw,
                                mnt->flags |= NFS_MOUNT_VER3;
                                mnt->version = 3;
                                break;
-#ifdef CONFIG_NFS_V4
                        case NFS4_VERSION:
                                mnt->flags &= ~NFS_MOUNT_VER3;
                                mnt->version = 4;
                                break;
-#endif
                        default:
                                goto out_invalid_value;
                        }
@@ -2199,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)
        s->s_flags = sb_mntdata->mntflags;
        s->s_fs_info = server;
+        s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
        ret = set_anon_super(s, server);
        if (ret == 0)
                server->s_dev = s->s_dev;
@@ -2277,7 +2281,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        };
        int error = -ENOMEM;
-        data = nfs_alloc_parsed_mount_data(3);
+        data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
        mntfh = nfs_alloc_fhandle();
        if (data == NULL || mntfh == NULL)
                goto out_free_fh;
@@ -2493,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
        sb->s_maxbytes = old_sb->s_maxbytes;
        sb->s_time_gran = 1;
        sb->s_op = old_sb->s_op;
-        nfs_initialise_sb(sb);
+        /*
+         * The VFS shouldn't apply the umask to mode bits. We will do
+         * so ourselves when necessary.
+         */
+        sb->s_flags  |= MS_POSIXACL;
+        sb->s_xattr  = old_sb->s_xattr;
+        nfs_initialise_sb(sb);
 }
 /*
@@ -2503,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb)
 {
        sb->s_time_gran = 1;
        sb->s_op = &nfs4_sops;
+        /*
+         * The VFS shouldn't apply the umask to mode bits. We will do
+         * so ourselves when necessary.
+         */
+        sb->s_flags  |= MS_POSIXACL;
+        sb->s_xattr = nfs4_xattr_handlers;
        nfs_initialise_sb(sb);
 }
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7bdec853140..e313a51acdd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
                return ERR_PTR(-ENOMEM);
-        task_setup_data.callback_data = data,
+        task_setup_data.callback_data = data;
        data->cred = rpc_lookup_cred();
        if (IS_ERR(data->cred)) {
@@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
-                atomic_read(&dentry->d_count));
+                dentry->d_count);
        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
        /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a527..c8278f4046c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
                if (nfs_have_delegation(inode, FMODE_WRITE))
                        nfsi->change_attr++;
        }
+        set_bit(PG_MAPPED, &req->wb_flags);
        SetPagePrivate(req->wb_page);
        set_page_private(req->wb_page, (unsigned long)req);
        nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        spin_lock(&inode->i_lock);
        set_page_private(req->wb_page, 0);
        ClearPagePrivate(req->wb_page);
+        clear_bit(PG_MAPPED, &req->wb_flags);
        radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
        nfsi->npages--;
        if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
                iput(inode);
        } else
                spin_unlock(&inode->i_lock);
-        nfs_clear_request(req);
        nfs_release_request(req);
 }
@@ -931,7 +932,7 @@ out_bad:
        while (!list_empty(&list)) {
                data = list_entry(list.next, struct nfs_write_data, pages);
                list_del(&data->pages);
-                nfs_writedata_release(data);
+                nfs_writedata_free(data);
        }
        nfs_redirty_request(req);
        return -ENOMEM;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c0..84c27d69d42 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
        gid_t gid;
 };
+struct nfsacl_simple_acl {
+        struct posix_acl acl;
+        struct posix_acl_entry ace[4];
+};
 static int
 xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
        return 0;
 }
-unsigned int
+/**
-nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+ * nfsacl_encode - Encode an NFSv3 ACL
-              struct posix_acl *acl, int encode_entries, int typeflag)
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+                  struct posix_acl *acl, int encode_entries, int typeflag)
 {
        int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
        struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
                .uid = inode->i_uid,
                .gid = inode->i_gid,
        };
+        struct nfsacl_simple_acl aclbuf;
        int err;
-        struct posix_acl *acl2 = NULL;
        if (entries > NFS_ACL_MAX_ENTRIES ||
            xdr_encode_word(buf, base, entries))
                return -EINVAL;
        if (encode_entries && acl && acl->a_count == 3) {
-                /* Fake up an ACL_MASK entry. */
+                struct posix_acl *acl2 = &aclbuf.acl;
-                acl2 = posix_acl_alloc(4, GFP_KERNEL);
-                if (!acl2)
+                /* Avoid the use of posix_acl_alloc().  nfsacl_encode() is
-                        return -ENOMEM;
+                 * invoked in contexts where a memory allocation failure is
+                 * fatal.  Fortunately this fake ACL is small enough to
+                 * construct on the stack. */
+                memset(&aclbuf, 0, sizeof(aclbuf)); /* whole on-stack ACL, not pointer-size bytes */
+                posix_acl_init(acl2, 4);
                /* Insert entries in canonical order: other orders seem
                 to confuse Solaris VxFS. */
                acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
                nfsacl_desc.acl = acl2;
        }
        err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
-        if (acl2)
-                posix_acl_release(acl2);
        if (!err)
                err = 8 + nfsacl_desc.desc.elem_size *
                          nfsacl_desc.desc.array_len;
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
        return 0;
 }
-unsigned int
+/**
-nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+ * nfsacl_decode - Decode an NFSv3 ACL
-              struct posix_acl **pacl)
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+                  struct posix_acl **pacl)
 {
        struct nfsacl_decode_desc nfsacl_desc = {
                .desc = {
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 00000000000..34e5c40af5e
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
+/*
+ *  Common NFSv4 ACL handling definitions.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LINUX_NFS4_ACL_H
+#define LINUX_NFS4_ACL_H
+#include <linux/posix_acl.h>
+/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
+ * fit in a page: */
+#define NFS4_ACL_MAX 170
+struct nfs4_acl *nfs4_acl_new(int);
+int nfs4_acl_get_whotype(char *, u32);
+int nfs4_acl_write_who(int who, char *p);
+int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
+                                        uid_t who, u32 mask);
+#define NFS4_ACL_TYPE_DEFAULT   0x01
+#define NFS4_ACL_DIR            0x02
+#define NFS4_ACL_OWNER          0x04
+struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
+                                struct posix_acl *, unsigned int flags);
+int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
+                                struct posix_acl **, unsigned int flags);
+#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6..8b31e5f8795 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
-#define MSNFS   /* HACK HACK */
 /*
 * NFS exporting and validation.
 *
@@ -1444,9 +1443,6 @@ static struct flags {
        { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
        { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
        { NFSEXP_V4ROOT, {"v4root", ""}},
-#ifdef MSNFS
-        { NFSEXP_MSNFS, {"msnfs", ""}},
-#endif
        { 0, {"", ""}}
 };
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 00000000000..2f3be132153
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
+/*
+ *  Mapping of UID to name and vice versa.
+ *
+ *  Copyright (c) 2002, 2003 The Regents of the University of
+ *  Michigan.  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(void);
+void nfsd_idmap_shutdown(void);
+#else
+static inline int nfsd_idmap_init(void)
+{
+        return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
+#endif
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06..2247fc91d5e 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
        __be32  nfserr;
        u32     max_blocksize = svc_max_payload(rqstp);
-        dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
+        dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
                                SVCFH_fmt(&argp->fh),
                                (unsigned long) argp->count,
-                                (unsigned long) argp->offset);
+                                (unsigned long long) argp->offset);
        /* Obtain buffer pointer for payload.
         * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
        __be32  nfserr;
        unsigned long cnt = argp->len;
-        dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
+        dprintk("nfsd: WRITE(3)    %s %d bytes at %Lu%s\n",
                                SVCFH_fmt(&argp->fh),
                                argp->len,
-                                (unsigned long) argp->offset,
+                                (unsigned long long) argp->offset,
                                argp->stable? " stable" : "");
        fh_copy(&resp->fh, &argp->fh);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a..7e84a852cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
        err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
                        &fhp->fh_post_attr);
        fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
-        if (err)
+        if (err) {
                fhp->fh_post_saved = 0;
-        else
+                /* Grab the ctime anyway - set_change_info might use it */
+                fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
+        } else
                fhp->fh_post_saved = 1;
 }
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e4805261515..ad88f1c0a4c 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
 #include <linux/slab.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs4_acl.h>
+#include "acl.h"
 /* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7..3be975e1891 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
        NFSPROC4_CLNT_CB_SEQUENCE,
 };
-enum nfs_cb_opnum4 {
-        OP_CB_RECALL            = 4,
-        OP_CB_SEQUENCE          = 11,
-};
 #define NFS4_MAXTAGLEN          20
 #define NFS4_enc_cb_null_sz             0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
                                        cb_sequence_dec_sz +            \
                                        op_dec_sz)
-/*
-* Generic encode routines from fs/nfs/nfs4xdr.c
-*/
-static inline __be32 *
-xdr_writemem(__be32 *p, const void *ptr, int nbytes)
-{
-        int tmp = XDR_QUADLEN(nbytes);
-        if (!tmp)
-                return p;
-        p[tmp-1] = 0;
-        memcpy(p, ptr, nbytes);
-        return p + tmp;
-}
-#define WRITE32(n)               *p++ = htonl(n)
-#define WRITEMEM(ptr,nbytes)     do {                           \
-        p = xdr_writemem(p, ptr, nbytes);                       \
-} while (0)
-#define RESERVE_SPACE(nbytes)   do {                            \
-        p = xdr_reserve_space(xdr, nbytes);                     \
-        if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
-        BUG_ON(!p);                                             \
-} while (0)
-/*
- * Generic decode routines from fs/nfs/nfs4xdr.c
- */
-#define DECODE_TAIL                             \
-        status = 0;                             \
-out:                                            \
-        return status;                          \
-xdr_error:                                      \
-        dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
-        status = -EIO;                          \
-        goto out
-#define READ32(x)         (x) = ntohl(*p++)
-#define READ64(x)         do {                  \
-        (x) = (u64)ntohl(*p++) << 32;           \
-        (x) |= ntohl(*p++);                     \
-} while (0)
-#define READTIME(x)       do {                  \
-        p++;                                    \
-        (x.tv_sec) = ntohl(*p++);               \
-        (x.tv_nsec) = ntohl(*p++);              \
-} while (0)
-#define READ_BUF(nbytes)  do { \
-        p = xdr_inline_decode(xdr, nbytes); \
-        if (!p) { \
-                dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
-                        __func__, __LINE__); \
-                return -EIO; \
-        } \
-} while (0)
 struct nfs4_cb_compound_hdr {
        /* args */
        u32             ident;  /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
        int             status;
 };
-static struct {
+/*
-int stat;
+ * Handle decode buffer overflows out-of-line.
-int errno;
+ */
-} nfs_cb_errtbl[] = {
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-        { NFS4_OK,              0               },
+{
-        { NFS4ERR_PERM,         EPERM           },
+        dprintk("NFS: %s prematurely hit the end of our receive buffer. "
-        { NFS4ERR_NOENT,        ENOENT          },
+                "Remaining buffer length is %tu words.\n",
-        { NFS4ERR_IO,           EIO             },
+                func, xdr->end - xdr->p);
-        { NFS4ERR_NXIO,         ENXIO           },
+}
-        { NFS4ERR_ACCESS,       EACCES          },
-        { NFS4ERR_EXIST,        EEXIST          },
-        { NFS4ERR_XDEV,         EXDEV           },
-        { NFS4ERR_NOTDIR,       ENOTDIR         },
-        { NFS4ERR_ISDIR,        EISDIR          },
-        { NFS4ERR_INVAL,        EINVAL          },
-        { NFS4ERR_FBIG,         EFBIG           },
-        { NFS4ERR_NOSPC,        ENOSPC          },
-        { NFS4ERR_ROFS,         EROFS           },
-        { NFS4ERR_MLINK,        EMLINK          },
-        { NFS4ERR_NAMETOOLONG,  ENAMETOOLONG    },
-        { NFS4ERR_NOTEMPTY,     ENOTEMPTY       },
-        { NFS4ERR_DQUOT,        EDQUOT          },
-        { NFS4ERR_STALE,        ESTALE          },
-        { NFS4ERR_BADHANDLE,    EBADHANDLE      },
-        { NFS4ERR_BAD_COOKIE,   EBADCOOKIE      },
-        { NFS4ERR_NOTSUPP,      ENOTSUPP        },
-        { NFS4ERR_TOOSMALL,     ETOOSMALL       },
-        { NFS4ERR_SERVERFAULT,  ESERVERFAULT    },
-        { NFS4ERR_BADTYPE,      EBADTYPE        },
-        { NFS4ERR_LOCKED,       EAGAIN          },
-        { NFS4ERR_RESOURCE,     EREMOTEIO       },
-        { NFS4ERR_SYMLINK,      ELOOP           },
-        { NFS4ERR_OP_ILLEGAL,   EOPNOTSUPP      },
-        { NFS4ERR_DEADLOCK,     EDEADLK         },
-        { -1,                   EIO             }
-};
-static int
+static __be32 *xdr_encode_empty_array(__be32 *p)
-nfs_cb_stat_to_errno(int stat)
 {
-        int i;
+        *p++ = xdr_zero;
-        for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
+        return p;
-                if (nfs_cb_errtbl[i].stat == stat)
-                        return nfs_cb_errtbl[i].errno;
-        }
-        /* If we cannot translate the error, the recovery routines should
-        * handle it.
-        * Note: remaining NFSv4 error codes have values > 10000, so should
-        * not conflict with native Linux error codes.
-        */
-        return stat;
 }
 /*
- * XDR encode
+ * Encode/decode NFSv4 CB basic data types
+ *
+ * Basic NFSv4 callback data types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section
+ * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
+ * 1 Protocol"
 */
-static void
+/*
-encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+ *      nfs_cb_opnum4
+ *
+ *      enum nfs_cb_opnum4 {
+ *              OP_CB_GETATTR           = 3,
+ *                ...
+ *      };
+ */
+enum nfs_cb_opnum4 {
+        OP_CB_GETATTR                   = 3,
+        OP_CB_RECALL                    = 4,
+        OP_CB_LAYOUTRECALL              = 5,
+        OP_CB_NOTIFY                    = 6,
+        OP_CB_PUSH_DELEG                = 7,
+        OP_CB_RECALL_ANY                = 8,
+        OP_CB_RECALLABLE_OBJ_AVAIL      = 9,
+        OP_CB_RECALL_SLOT               = 10,
+        OP_CB_SEQUENCE                  = 11,
+        OP_CB_WANTS_CANCELLED           = 12,
+        OP_CB_NOTIFY_LOCK               = 13,
+        OP_CB_NOTIFY_DEVICEID           = 14,
+        OP_CB_ILLEGAL                   = 10044
+};
+static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
 {
        __be32 *p;
-        RESERVE_SPACE(sizeof(stateid_t));
+        p = xdr_reserve_space(xdr, 4);
-        WRITE32(sid->si_generation);
+        *p = cpu_to_be32(op);
-        WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
 }
-static void
+/*
-encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
+ * nfs_fh4
+ *
+ *      typedef opaque nfs_fh4<NFS4_FHSIZE>;
+ */
+static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
 {
-        __be32 * p;
+        u32 length = fh->fh_size;
+        __be32 *p;
-        RESERVE_SPACE(16);
+        BUG_ON(length > NFS4_FHSIZE);
-        WRITE32(0);            /* tag length is always 0 */
+        p = xdr_reserve_space(xdr, 4 + length);
-        WRITE32(hdr->minorversion);
+        xdr_encode_opaque(p, &fh->fh_base, length);
-        WRITE32(hdr->ident);
-        hdr->nops_p = p;
-        WRITE32(hdr->nops);
 }
-static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+/*
+ * stateid4
+ *
+ *      struct stateid4 {
+ *              uint32_t        seqid;
+ *              opaque          other[12];
+ *      };
+ */
+static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
 {
-        *hdr->nops_p = htonl(hdr->nops);
+        __be32 *p;
+        p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
+        *p++ = cpu_to_be32(sid->si_generation);
+        xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
 }
-static void
+/*
-encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+ * sessionid4
-                struct nfs4_cb_compound_hdr *hdr)
+ *
+ *      typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
+ */
+static void encode_sessionid4(struct xdr_stream *xdr,
+                              const struct nfsd4_session *session)
 {
        __be32 *p;
-        int len = dp->dl_fh.fh_size;
+        p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
-        RESERVE_SPACE(4);
+        xdr_encode_opaque_fixed(p, session->se_sessionid.data,
-        WRITE32(OP_CB_RECALL);
+                                        NFS4_MAX_SESSIONID_LEN);
-        encode_stateid(xdr, &dp->dl_stateid);
-        RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
-        WRITE32(0); /* truncate optimization not implemented */
-        WRITE32(len);
-        WRITEMEM(&dp->dl_fh.fh_base, len);
-        hdr->nops++;
 }
-static void
+/*
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
+ * nfsstat4
-                   struct nfs4_cb_compound_hdr *hdr)
+ */
-{
+static const struct {
-        __be32 *p;
+        int stat;
-        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
+        int errno;
+} nfs_cb_errtbl[] = {
+        { NFS4_OK,              0               },
+        { NFS4ERR_PERM,         -EPERM          },
+        { NFS4ERR_NOENT,        -ENOENT         },
+        { NFS4ERR_IO,           -EIO            },
+        { NFS4ERR_NXIO,         -ENXIO          },
+        { NFS4ERR_ACCESS,       -EACCES         },
+        { NFS4ERR_EXIST,        -EEXIST         },
+        { NFS4ERR_XDEV,         -EXDEV          },
+        { NFS4ERR_NOTDIR,       -ENOTDIR        },
+        { NFS4ERR_ISDIR,        -EISDIR         },
+        { NFS4ERR_INVAL,        -EINVAL         },
+        { NFS4ERR_FBIG,         -EFBIG          },
+        { NFS4ERR_NOSPC,        -ENOSPC         },
+        { NFS4ERR_ROFS,         -EROFS          },
+        { NFS4ERR_MLINK,        -EMLINK         },
+        { NFS4ERR_NAMETOOLONG,  -ENAMETOOLONG   },
+        { NFS4ERR_NOTEMPTY,     -ENOTEMPTY      },
+        { NFS4ERR_DQUOT,        -EDQUOT         },
+        { NFS4ERR_STALE,        -ESTALE         },
+        { NFS4ERR_BADHANDLE,    -EBADHANDLE     },
+        { NFS4ERR_BAD_COOKIE,   -EBADCOOKIE     },
+        { NFS4ERR_NOTSUPP,      -ENOTSUPP       },
+        { NFS4ERR_TOOSMALL,     -ETOOSMALL      },
+        { NFS4ERR_SERVERFAULT,  -ESERVERFAULT   },
+        { NFS4ERR_BADTYPE,      -EBADTYPE       },
+        { NFS4ERR_LOCKED,       -EAGAIN         },
+        { NFS4ERR_RESOURCE,     -EREMOTEIO      },
+        { NFS4ERR_SYMLINK,      -ELOOP          },
+        { NFS4ERR_OP_ILLEGAL,   -EOPNOTSUPP     },
+        { NFS4ERR_DEADLOCK,     -EDEADLK        },
+        { -1,                   -EIO            }
+};
-        if (hdr->minorversion == 0)
+/*
-                return;
+ * If we cannot translate the error, the recovery routines should
+ * handle it.
+ *
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+static int nfs_cb_stat_to_errno(int status)
+{
+        int i;
-        RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
+        for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
+                if (nfs_cb_errtbl[i].stat == status)
+                        return nfs_cb_errtbl[i].errno;
+        }
-        WRITE32(OP_CB_SEQUENCE);
+        dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
-        WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        return -status;
-        WRITE32(ses->se_cb_seq_nr);
-        WRITE32(0);             /* slotid, always 0 */
-        WRITE32(0);             /* highest slotid always 0 */
-        WRITE32(0);             /* cachethis always 0 */
-        WRITE32(0); /* FIXME: support referring_call_lists */
-        hdr->nops++;
 }
-static int
+static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
-nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
+                               enum nfsstat4 *status)
 {
-        struct xdr_stream xdrs, *xdr = &xdrs;
+        __be32 *p;
+        u32 op;
-        xdr_init_encode(&xdrs, &req->rq_snd_buf, p);
+        p = xdr_inline_decode(xdr, 4 + 4);
-        RESERVE_SPACE(0);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        op = be32_to_cpup(p++);
+        if (unlikely(op != expected))
+                goto out_unexpected;
+        *status = be32_to_cpup(p);
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+out_unexpected:
+        dprintk("NFSD: Callback server returned operation %d but "
+                "we issued a request for %d\n", op, expected);
+        return -EIO;
 }
-static int
+/*
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+ * CB_COMPOUND4args
-                struct nfsd4_callback *cb)
+ *
+ *      struct CB_COMPOUND4args {
+ *              utf8str_cs      tag;
+ *              uint32_t        minorversion;
+ *              uint32_t        callback_ident;
+ *              nfs_cb_argop4   argarray<>;
+ *      };
+*/
+static void encode_cb_compound4args(struct xdr_stream *xdr,
+                                    struct nfs4_cb_compound_hdr *hdr)
 {
-        struct xdr_stream xdr;
+        __be32 * p;
-        struct nfs4_delegation *args = cb->cb_op;
-        struct nfs4_cb_compound_hdr hdr = {
-                .ident = cb->cb_clp->cl_cb_ident,
-                .minorversion = cb->cb_minorversion,
-        };
-        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
-        encode_cb_compound_hdr(&xdr, &hdr);
+        p = xdr_encode_empty_array(p);          /* empty tag */
-        encode_cb_sequence(&xdr, cb, &hdr);
+        *p++ = cpu_to_be32(hdr->minorversion);
-        encode_cb_recall(&xdr, args, &hdr);
+        *p++ = cpu_to_be32(hdr->ident);
-        encode_cb_nops(&hdr);
+        hdr->nops_p = p;
+        *p = cpu_to_be32(hdr->nops);            /* argarray element count */
+}
+/*
+ * Update argarray element count
+ */
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+{
+        BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
+        *hdr->nops_p = cpu_to_be32(hdr->nops);
+}
+/*
+ * CB_COMPOUND4res
+ *
+ *      struct CB_COMPOUND4res {
+ *              nfsstat4        status;
+ *              utf8str_cs      tag;
+ *              nfs_cb_resop4   resarray<>;
+ *      };
+ */
+static int decode_cb_compound4res(struct xdr_stream *xdr,
+                                  struct nfs4_cb_compound_hdr *hdr)
+{
+        u32 length;
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4 + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        hdr->status = be32_to_cpup(p++);
+        /* Ignore the tag */
+        length = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, length + 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        hdr->nops = be32_to_cpup(p);
        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
+/*
+ * CB_RECALL4args
+ *
+ *      struct CB_RECALL4args {
+ *              stateid4        stateid;
+ *              bool            truncate;
+ *              nfs_fh4         fh;
+ *      };
+ */
+static void encode_cb_recall4args(struct xdr_stream *xdr,
+                                  const struct nfs4_delegation *dp,
+                                  struct nfs4_cb_compound_hdr *hdr)
+{
+        __be32 *p;
+        encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
+        encode_stateid4(xdr, &dp->dl_stateid);
+        p = xdr_reserve_space(xdr, 4);
+        *p++ = xdr_zero;                        /* truncate */
-static int
+        encode_nfs_fh4(xdr, &dp->dl_fh);
-decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
-        __be32 *p;
-        u32 taglen;
-        READ_BUF(8);
+        hdr->nops++;
-        READ32(hdr->status);
-        /* We've got no use for the tag; ignore it: */
-        READ32(taglen);
-        READ_BUF(taglen + 4);
-        p += XDR_QUADLEN(taglen);
-        READ32(hdr->nops);
-        return 0;
 }
-static int
+/*
-decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+ * CB_SEQUENCE4args
+ *
+ *      struct CB_SEQUENCE4args {
+ *              sessionid4              csa_sessionid;
+ *              sequenceid4             csa_sequenceid;
+ *              slotid4                 csa_slotid;
+ *              slotid4                 csa_highest_slotid;
+ *              bool                    csa_cachethis;
+ *              referring_call_list4    csa_referring_call_lists<>;
+ *      };
+ */
+static void encode_cb_sequence4args(struct xdr_stream *xdr,
+                                    const struct nfsd4_callback *cb,
+                                    struct nfs4_cb_compound_hdr *hdr)
 {
+        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
        __be32 *p;
-        u32 op;
-        int32_t nfserr;
+        if (hdr->minorversion == 0)
+                return;
-        READ_BUF(8);
-        READ32(op);
+        encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
-        if (op != expected) {
+        encode_sessionid4(xdr, session);
-                dprintk("NFSD: decode_cb_op_hdr: Callback server returned "
-                         " operation %d but we issued a request for %d\n",
+        p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
-                         op, expected);
+        *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
-                return -EIO;
+        *p++ = xdr_zero;                        /* csa_slotid */
-        }
+        *p++ = xdr_zero;                        /* csa_highest_slotid */
-        READ32(nfserr);
+        *p++ = xdr_zero;                        /* csa_cachethis */
-        if (nfserr != NFS_OK)
+        xdr_encode_empty_array(p);              /* csa_referring_call_lists */
-                return -nfs_cb_stat_to_errno(nfserr);
-        return 0;
+        hdr->nops++;
 }
 /*
+ * CB_SEQUENCE4resok
+ *
+ *      struct CB_SEQUENCE4resok {
+ *              sessionid4      csr_sessionid;
+ *              sequenceid4     csr_sequenceid;
+ *              slotid4         csr_slotid;
+ *              slotid4         csr_highest_slotid;
+ *              slotid4         csr_target_highest_slotid;
+ *      };
+ *
+ *      union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
+ *      case NFS4_OK:
+ *              CB_SEQUENCE4resok       csr_resok4;
+ *      default:
+ *              void;
+ *      };
+ *
 * Our current back channel implmentation supports a single backchannel
 * with a single slot.
 */
-static int
+static int decode_cb_sequence4resok(struct xdr_stream *xdr,
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
+                                    struct nfsd4_callback *cb)
-                   struct rpc_rqst *rqstp)
 {
-        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
+        struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
        struct nfs4_sessionid id;
        int status;
-        u32 dummy;
        __be32 *p;
+        u32 dummy;
-        if (cb->cb_minorversion == 0)
+        status = -ESERVERFAULT;
-                return 0;
-        status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
-        if (status)
-                return status;
        /*
         * If the server returns different values for sessionID, slotID or
         * sequence number, the server is looney tunes.
         */
-        status = -ESERVERFAULT;
+        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
+        if (unlikely(p == NULL))
-        READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+                goto out_overflow;
        memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+        if (memcmp(id.data, session->se_sessionid.data,
-        if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
+                                        NFS4_MAX_SESSIONID_LEN) != 0) {
-                dprintk("%s Invalid session id\n", __func__);
+                dprintk("NFS: %s Invalid session id\n", __func__);
                goto out;
        }
-        READ32(dummy);
+        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-        if (dummy != ses->se_cb_seq_nr) {
-                dprintk("%s Invalid sequence number\n", __func__);
+        dummy = be32_to_cpup(p++);
+        if (dummy != session->se_cb_seq_nr) {
+                dprintk("NFS: %s Invalid sequence number\n", __func__);
                goto out;
        }
-        READ32(dummy);  /* slotid must be 0 */
+        dummy = be32_to_cpup(p++);
        if (dummy != 0) {
-                dprintk("%s Invalid slotid\n", __func__);
+                dprintk("NFS: %s Invalid slotid\n", __func__);
                goto out;
        }
-        /* FIXME: process highest slotid and target highest slotid */
+        /*
+         * FIXME: process highest slotid and target highest slotid
+         */
        status = 0;
 out:
        return status;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
 }
+static int decode_cb_sequence4res(struct xdr_stream *xdr,
+                                  struct nfsd4_callback *cb)
+{
+        enum nfsstat4 nfserr;
+        int status;
+        if (cb->cb_minorversion == 0)
+                return 0;
-static int
+        status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
-nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
+        if (unlikely(status))
+                goto out;
+        if (unlikely(nfserr != NFS4_OK))
+                goto out_default;
+        status = decode_cb_sequence4resok(xdr, cb);
+out:
+        return status;
+out_default:
+        return nfs_cb_stat_to_errno(status);
+}
+/*
+ * NFSv4.0 and NFSv4.1 XDR encode functions
+ *
+ * NFSv4.0 callback argument types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+/*
+ * NB: Without this zero space reservation, callbacks over krb5p fail
+ */
+static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                 void *__unused)
+{
+        xdr_reserve_space(xdr, 0);
+}
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ */
+static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                   const struct nfsd4_callback *cb)
+{
+        const struct nfs4_delegation *args = cb->cb_op;
+        struct nfs4_cb_compound_hdr hdr = {
+                .ident = cb->cb_clp->cl_cb_ident,
+                .minorversion = cb->cb_minorversion,
+        };
+        encode_cb_compound4args(xdr, &hdr);
+        encode_cb_sequence4args(xdr, cb, &hdr);
+        encode_cb_recall4args(xdr, args, &hdr);
+        encode_cb_nops(&hdr);
+}
+/*
+ * NFSv4.0 and NFSv4.1 XDR decode functions
+ *
+ * NFSv4.0 callback result types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661:  "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+                                void *__unused)
 {
        return 0;
 }
-static int
+/*
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
-                struct nfsd4_callback *cb)
+ */
+static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
+                                  struct xdr_stream *xdr,
+                                  struct nfsd4_callback *cb)
 {
-        struct xdr_stream xdr;
        struct nfs4_cb_compound_hdr hdr;
+        enum nfsstat4 nfserr;
        int status;
-        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_cb_compound4res(xdr, &hdr);
-        status = decode_cb_compound_hdr(&xdr, &hdr);
+        if (unlikely(status))
-        if (status)
                goto out;
-        if (cb) {
-                status = decode_cb_sequence(&xdr, cb, rqstp);
+        if (cb != NULL) {
-                if (status)
+                status = decode_cb_sequence4res(xdr, cb);
+                if (unlikely(status))
                        goto out;
        }
-        status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
+        status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
+        if (unlikely(status))
+                goto out;
+        if (unlikely(nfserr != NFS4_OK))
+                goto out_default;
 out:
        return status;
+out_default:
+        return nfs_cb_stat_to_errno(status);
 }
 /*
 * RPC procedure tables
 */
-#define PROC(proc, call, argtype, restype)                              \
+#define PROC(proc, call, argtype, restype)                              \
-[NFSPROC4_CLNT_##proc] = {                                              \
+[NFSPROC4_CLNT_##proc] = {                                              \
-        .p_proc   = NFSPROC4_CB_##call,                                 \
+        .p_proc    = NFSPROC4_CB_##call,                                \
-        .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                    \
+        .p_encode  = (kxdreproc_t)nfs4_xdr_enc_##argtype,               \
-        .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                    \
+        .p_decode  = (kxdrdproc_t)nfs4_xdr_dec_##restype,               \
-        .p_arglen = NFS4_##argtype##_sz,                                \
+        .p_arglen  = NFS4_enc_##argtype##_sz,                           \
-        .p_replen = NFS4_##restype##_sz,                                \
+        .p_replen  = NFS4_dec_##restype##_sz,                           \
-        .p_statidx = NFSPROC4_CB_##call,                                \
+        .p_statidx = NFSPROC4_CB_##call,                                \
-        .p_name   = #proc,                                              \
+        .p_name    = #proc,                                             \
-}
+}
-static struct rpc_procinfo     nfs4_cb_procedures[] = {
+static struct rpc_procinfo nfs4_cb_procedures[] = {
-    PROC(CB_NULL,      NULL,     enc_cb_null,     dec_cb_null),
+        PROC(CB_NULL,   NULL,           cb_null,        cb_null),
-    PROC(CB_RECALL,    COMPOUND,   enc_cb_recall,      dec_cb_recall),
+        PROC(CB_RECALL, COMPOUND,       cb_recall,      cb_recall),
 };
-static struct rpc_version       nfs_cb_version4 = {
+static struct rpc_version nfs_cb_version4 = {
 /*
 * Note on the callback rpc program version number: despite language in rfc
 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version       nfs_cb_version4 = {
 * in practice that appears to be what implementations use.  The section
 * 18.36.3 language is expected to be fixed in an erratum.
 */
-        .number                 = 1,
+        .number                 = 1,
-        .nrprocs                = ARRAY_SIZE(nfs4_cb_procedures),
+        .nrprocs                = ARRAY_SIZE(nfs4_cb_procedures),
-        .procs                  = nfs4_cb_procedures
+        .procs                  = nfs4_cb_procedures
 };
-static struct rpc_version *     nfs_cb_version[] = {
+static struct rpc_version *nfs_cb_version[] = {
        &nfs_cb_version4,
 };
 static struct rpc_program cb_program;
 static struct rpc_stat cb_stats = {
-                .program        = &cb_program
+        .program                = &cb_program
 };
 #define NFS4_CALLBACK 0x40000000
 static struct rpc_program cb_program = {
-                .name           = "nfs4_cb",
+        .name                   = "nfs4_cb",
-                .number         = NFS4_CALLBACK,
+        .number                 = NFS4_CALLBACK,
-                .nrvers         = ARRAY_SIZE(nfs_cb_version),
+        .nrvers                 = ARRAY_SIZE(nfs_cb_version),
-                .version        = nfs_cb_version,
+        .version                = nfs_cb_version,
-                .stats          = &cb_stats,
+        .stats                  = &cb_stats,
-                .pipe_dir_name  = "/nfsd4_cb",
+        .pipe_dir_name          = "/nfsd4_cb",
 };
 static int max_cb_time(void)
@@ -470,10 +628,8 @@ static int max_cb_time(void)
        return max(nfsd4_lease/10, (time_t)1) * HZ;
 }
-/* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cl_cb_set an atomic? */
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
 {
        struct rpc_timeout      timeparms = {
                .to_initval     = max_cb_time(),
@@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
                .net            = &init_net,
                .address        = (struct sockaddr *) &conn->cb_addr,
                .addrsize       = conn->cb_addrlen,
+                .saddress       = (struct sockaddr *) &conn->cb_saddr,
                .timeout        = &timeparms,
                .program        = &cb_program,
                .version        = 0,
@@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
                args.protocol = XPRT_TRANSPORT_TCP;
                clp->cl_cb_ident = conn->cb_ident;
        } else {
+                if (!conn->cb_xprt)
+                        return -EINVAL;
+                clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
+                clp->cl_cb_session = ses;
                args.bc_xprt = conn->cb_xprt;
                args.prognumber = clp->cl_cb_session->se_cb_prog;
                args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
                (int)clp->cl_name.len, clp->cl_name.data, reason);
 }
+/*
+ * Flag the client's callback channel as down (NFSD4_CB_DOWN) and log that no
+ * callback path is available, passing along the reason code.
+ */
+static void nfsd4_mark_cb_down(struct nfs4_client *clp,
+                               int reason)
+{
+        clp->cl_cb_state = NFSD4_CB_DOWN;
+        warn_no_callback_path(clp, reason);
+}
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
        if (task->tk_status)
-                warn_no_callback_path(clp, task->tk_status);
+                nfsd4_mark_cb_down(clp, task->tk_status);
        else
-                atomic_set(&clp->cl_cb_set, 1);
+                clp->cl_cb_state = NFSD4_CB_UP;
 }
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -551,6 +718,11 @@ int set_callback_cred(void)
 static struct workqueue_struct *callback_wq;
+/* Queue a callback's work item on callback_wq so it runs from the workqueue. */
+static void run_nfsd4_cb(struct nfsd4_callback *cb)
+{
+        struct work_struct *work = &cb->cb_work;
+        queue_work(callback_wq, work);
+}
 static void do_probe_callback(struct nfs4_client *clp)
 {
        struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -565,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
        cb->cb_ops = &nfsd4_cb_probe_ops;
-        queue_work(callback_wq, &cb->cb_work);
+        run_nfsd4_cb(cb);
 }
 /*
@@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
 */
 void nfsd4_probe_callback(struct nfs4_client *clp)
 {
+        /* XXX: atomicity?  Also, should we be using cl_cb_flags? */
+        clp->cl_cb_state = NFSD4_CB_UNKNOWN;
        set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
        do_probe_callback(clp);
 }
-void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+void nfsd4_probe_callback_sync(struct nfs4_client *clp)
 {
-        BUG_ON(atomic_read(&clp->cl_cb_set));
+        nfsd4_probe_callback(clp);
+        flush_workqueue(callback_wq);
+}
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
+        clp->cl_cb_state = NFSD4_CB_UNKNOWN;
        spin_lock(&clp->cl_lock);
        memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
        spin_unlock(&clp->cl_lock);
@@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 * If the slot is available, then mark it busy.  Otherwise, set the
 * thread for sleeping on the callback RPC wait queue.
 */
-static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
+static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
-                struct rpc_task *task)
 {
-        u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
-        int status = 0;
-        dprintk("%s: %u:%u:%u:%u\n", __func__,
-                ptr[0], ptr[1], ptr[2], ptr[3]);
        if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
                rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
                dprintk("%s slot is busy\n", __func__);
-                status = -EAGAIN;
+                return false;
-                goto out;
        }
-out:
+        return true;
-        dprintk("%s status=%d\n", __func__, status);
-        return status;
 }
 /*
@@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
        u32 minorversion = clp->cl_minorversion;
-        int status = 0;
        cb->cb_minorversion = minorversion;
        if (minorversion) {
-                status = nfsd41_cb_setup_sequence(clp, task);
+                if (!nfsd41_cb_get_slot(clp, task))
-                if (status) {
-                        if (status != -EAGAIN) {
-                                /* terminate rpc task */
-                                task->tk_status = status;
-                                task->tk_action = NULL;
-                        }
                        return;
-                }
        }
+        spin_lock(&clp->cl_lock);
+        if (list_empty(&cb->cb_per_client)) {
+                /* This is the first call, not a restart */
+                cb->cb_done = false;
+                list_add(&cb->cb_per_client, &clp->cl_callbacks);
+        }
+        spin_unlock(&clp->cl_lock);
        rpc_call_start(task);
 }
@@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
        nfsd4_cb_done(task, calldata);
-        if (current_rpc_client == NULL) {
+        if (current_rpc_client != task->tk_client) {
-                /* We're shutting down; give up. */
+                /* We're shutting down or changing cl_cb_client; leave
-                /* XXX: err, or is it ok just to fall through
+                 * it to nfsd4_process_cb_update to restart the call if
-                 * and rpc_restart_call? */
+                 * necessary. */
                return;
        }
+        if (cb->cb_done)
+                return;
        switch (task->tk_status) {
        case 0:
+                cb->cb_done = true;
                return;
        case -EBADHANDLE:
        case -NFS4ERR_BAD_STATEID:
@@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
                break;
        default:
                /* Network partition? */
-                atomic_set(&clp->cl_cb_set, 0);
+                nfsd4_mark_cb_down(clp, task->tk_status);
-                warn_no_callback_path(clp, task->tk_status);
-                if (current_rpc_client != task->tk_client) {
-                        /* queue a callback on the new connection: */
-                        atomic_inc(&dp->dl_count);
-                        nfsd4_cb_recall(dp);
-                        return;
-                }
        }
        if (dp->dl_retries--) {
                rpc_delay(task, 2*HZ);
                task->tk_status = 0;
                rpc_restart_call_prepare(task);
                return;
-        } else {
-                atomic_set(&clp->cl_cb_set, 0);
-                warn_no_callback_path(clp, task->tk_status);
        }
+        nfsd4_mark_cb_down(clp, task->tk_status);
+        cb->cb_done = true;
 }
 static void nfsd4_cb_recall_release(void *calldata)
 {
        struct nfsd4_callback *cb = calldata;
+        struct nfs4_client *clp = cb->cb_clp;
        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-        nfs4_put_delegation(dp);
+        if (cb->cb_done) {
+                spin_lock(&clp->cl_lock);
+                list_del(&cb->cb_per_client);
+                spin_unlock(&clp->cl_lock);
+                nfs4_put_delegation(dp);
+        }
 }
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
        flush_workqueue(callback_wq);
 }
-void nfsd4_release_cb(struct nfsd4_callback *cb)
+static void nfsd4_release_cb(struct nfsd4_callback *cb)
 {
        if (cb->cb_ops->rpc_release)
                cb->cb_ops->rpc_release(cb);
 }
-void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+/*
+ * Walk every session of the client and return the first connection whose
+ * flags include NFS4_CDFC4_BACK, or NULL when no such connection exists.
+ *
+ * requires cl_lock.
+ */
+static struct nfsd4_conn *__nfsd4_find_backchannel(struct nfs4_client *clp)
+{
+        struct nfsd4_session *ses;
+        struct nfsd4_conn *conn;
+        list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) {
+                list_for_each_entry(conn, &ses->se_conns, cn_persession) {
+                        /* Skip connections not bound for backchannel use. */
+                        if (!(conn->cn_flags & NFS4_CDFC4_BACK))
+                                continue;
+                        return conn;
+                }
+        }
+        return NULL;
+}
+static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 {
        struct nfs4_cb_conn conn;
        struct nfs4_client *clp = cb->cb_clp;
+        struct nfsd4_session *ses = NULL;
+        struct nfsd4_conn *c;
        int err;
        /*
@@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
                rpc_shutdown_client(clp->cl_cb_client);
                clp->cl_cb_client = NULL;
        }
+        if (clp->cl_cb_conn.cb_xprt) {
+                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+                clp->cl_cb_conn.cb_xprt = NULL;
+        }
        if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
                return;
        spin_lock(&clp->cl_lock);
@@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
        BUG_ON(!clp->cl_cb_flags);
        clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
        memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+        c = __nfsd4_find_backchannel(clp);
+        if (c) {
+                svc_xprt_get(c->cn_xprt);
+                conn.cb_xprt = c->cn_xprt;
+                ses = c->cn_session;
+        }
        spin_unlock(&clp->cl_lock);
-        err = setup_callback_client(clp, &conn);
+        err = setup_callback_client(clp, &conn, ses);
-        if (err)
+        if (err) {
                warn_no_callback_path(clp, err);
+                return;
+        }
+        /* Yay, the callback channel's back! Restart any callbacks: */
+        list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
+                run_nfsd4_cb(cb);
 }
 void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
        struct nfsd4_callback *cb = &dp->dl_recall;
+        struct nfs4_client *clp = dp->dl_client;
        dp->dl_retries = 1;
        cb->cb_op = dp;
-        cb->cb_clp = dp->dl_client;
+        cb->cb_clp = clp;
        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
        cb->cb_msg.rpc_argp = cb;
        cb->cb_msg.rpc_resp = cb;
@@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
        cb->cb_ops = &nfsd4_cb_recall_ops;
        dp->dl_retries = 1;
-        queue_work(callback_wq, &dp->dl_recall.cb_work);
+        INIT_LIST_HEAD(&cb->cb_per_client);
+        cb->cb_done = true;
+        run_nfsd4_cb(&dp->dl_recall);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0..6d2c397d458 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
 */
 #include <linux/module.h>
-#include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include "idmap.h"
+#include "nfsd.h"
 /*
 * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
        return clp->name;
 }
-static int
+static __be32
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
                uid_t *id)
 {
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
        int ret;
        if (namelen + 1 > sizeof(key.name))
-                return -EINVAL;
+                return nfserr_badowner;
        memcpy(key.name, name, namelen);
        key.name[namelen] = '\0';
        strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
        ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
        if (ret == -ENOENT)
-                ret = -ESRCH; /* nfserr_badname */
+                return nfserr_badowner;
        if (ret)
-                return ret;
+                return nfserrno(ret);
        *id = item->id;
        cache_put(&item->h, &nametoid_cache);
        return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
        return ret;
 }
-int
+__be32
 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
                __u32 *id)
 {
        return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
 }
-int
+__be32
 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
                __u32 *id)
 {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7..db52546143d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        return status;
 }
-static __be32
+static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
-nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-              void *arg)
 {
        struct svc_fh tmp_fh;
        __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        ret = exp_pseudoroot(rqstp, &tmp_fh);
        if (ret)
                return ret;
-        if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
+        if (tmp_fh.fh_dentry == fh->fh_dentry) {
                fh_put(&tmp_fh);
                return nfserr_noent;
        }
        fh_put(&tmp_fh);
-        return nfsd_lookup(rqstp, &cstate->current_fh,
+        return nfsd_lookup(rqstp, fh, "..", 2, fh);
-                           "..", 2, &cstate->current_fh);
+}
+static __be32
+nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+              void *arg)
+{
+        return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
 }
 static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        } else
                secinfo->si_exp = exp;
        dput(dentry);
+        if (cstate->minorversion)
+                /* See rfc 5661 section 2.6.3.1.1.8 */
+                fh_put(&cstate->current_fh);
        return err;
 }
 static __be32
+nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+              struct nfsd4_secinfo_no_name *sin)
+{
+        /*
+         * SECINFO_NO_NAME: hand back the export of either the current
+         * filehandle or its parent, depending on sin->sin_style.  Any other
+         * style is rejected with nfserr_inval.
+         */
+        struct svc_fh *fh = &cstate->current_fh;
+        __be32 err;
+        if (sin->sin_style == NFS4_SECINFO_STYLE4_PARENT) {
+                /* Step to the parent; the lookup result lands in fh itself. */
+                err = nfsd4_do_lookupp(rqstp, fh);
+                if (err)
+                        return err;
+        } else if (sin->sin_style != NFS4_SECINFO_STYLE4_CURRENT_FH) {
+                return nfserr_inval;
+        }
+        /* Take an export reference for sin_exp before releasing the fh. */
+        exp_get(fh->fh_export);
+        sin->sin_exp = fh->fh_export;
+        fh_put(fh);
+        return nfs_ok;
+}
+static __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
              struct nfsd4_setattr *setattr)
 {
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
 * Also note, enforced elsewhere:
 *      - SEQUENCE other than as first op results in
 *        NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
- *      - BIND_CONN_TO_SESSION must be the only op in its compound
+ *      - BIND_CONN_TO_SESSION must be the only op in its compound.
- *        (Will be enforced in nfsd4_bind_conn_to_session().)
+ *        (Enforced in nfsd4_bind_conn_to_session().)
 *      - DESTROY_SESSION must be the final operation in a compound, if
 *        sessionid's in SEQUENCE and DESTROY_SESSION are the same.
 *        (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
                nfsd4_increment_op_stats(op->opnum);
        }
-        if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
-                dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
-                status = nfserr_jukebox;
-        }
        resp->cstate.status = status;
        fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
                .op_name = "OP_EXCHANGE_ID",
        },
+        [OP_BIND_CONN_TO_SESSION] = {
+                .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
+                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+                .op_name = "OP_BIND_CONN_TO_SESSION",
+        },
        [OP_CREATE_SESSION] = {
                .op_func = (nfsd4op_func)nfsd4_create_session,
                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_flags = ALLOWED_WITHOUT_FH,
                .op_name = "OP_RECLAIM_COMPLETE",
        },
+        [OP_SECINFO_NO_NAME] = {
+                .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+                .op_name = "OP_SECINFO_NO_NAME",
+        },
 };
 static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a2..ffb59ef6f82 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
        int status;
-        /* note: we currently use this path only for minorversion 0 */
        if (nfs4_has_reclaimed_state(child->d_name.name, false))
                return 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1e5ec6b510..d98d0213285 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        dp->dl_client = clp;
        get_nfs4_file(fp);
        dp->dl_file = fp;
-        nfs4_file_get_access(fp, O_RDONLY);
+        dp->dl_vfs_file = find_readable_file(fp);
+        get_file(dp->dl_vfs_file);
        dp->dl_flock = NULL;
        dp->dl_type = type;
        dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
        if (atomic_dec_and_test(&dp->dl_count)) {
                dprintk("NFSD: freeing dp %p\n",dp);
                put_nfs4_file(dp->dl_file);
+                fput(dp->dl_vfs_file);
                kmem_cache_free(deleg_slab, dp);
                num_delegations--;
        }
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 static void
 nfs4_close_delegation(struct nfs4_delegation *dp)
 {
-        struct file *filp = find_readable_file(dp->dl_file);
        dprintk("NFSD: close_delegation dp %p\n",dp);
+        /* XXX: do we even need this check?: */
        if (dp->dl_flock)
-                vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
+                vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
-        nfs4_file_put_access(dp->dl_file, O_RDONLY);
 }
 /* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
                free_conn(c);
        }
        spin_unlock(&clp->cl_lock);
+        nfsd4_probe_callback(clp);
 }
 static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -673,27 +674,39 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
        spin_unlock(&clp->cl_lock);
 }
-static void nfsd4_register_conn(struct nfsd4_conn *conn)
+static int nfsd4_register_conn(struct nfsd4_conn *conn)
 {
        conn->cn_xpt_user.callback = nfsd4_conn_lost;
-        register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+        return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
-static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
 {
        struct nfsd4_conn *conn;
-        u32 flags = NFS4_CDFC4_FORE;
+        int ret;
-        if (ses->se_flags & SESSION4_BACK_CHAN)
+        conn = alloc_conn(rqstp, dir);
-                flags |= NFS4_CDFC4_BACK;
-        conn = alloc_conn(rqstp, flags);
        if (!conn)
                return nfserr_jukebox;
        nfsd4_hash_conn(conn, ses);
-        nfsd4_register_conn(conn);
+        ret = nfsd4_register_conn(conn);
+        if (ret)
+                /* oops; xprt is already down: */
+                nfsd4_conn_lost(&conn->cn_xpt_user);
        return nfs_ok;
 }
+/*
+ * Add a connection for rqstp to the session: fore channel always, plus the
+ * back channel when the session has SESSION4_BACK_CHAN set in se_flags.
+ */
+static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+        u32 dir = (ses->se_flags & SESSION4_BACK_CHAN)
+                        ? (NFS4_CDFC4_FORE | NFS4_CDFC4_BACK)
+                        : NFS4_CDFC4_FORE;
+        return nfsd4_new_conn(rqstp, ses, dir);
+}
+/* must be called under client_lock */
 static void nfsd4_del_conns(struct nfsd4_session *s)
 {
        struct nfs4_client *clp = s->se_client;
@@ -745,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
         */
        slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
        numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+        if (numslots < 1)
+                return NULL;
        new = alloc_session(slotsize, numslots);
        if (!new) {
@@ -765,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
        idx = hash_sessionid(&new->se_sessionid);
        spin_lock(&client_lock);
        list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+        spin_lock(&clp->cl_lock);
        list_add(&new->se_perclnt, &clp->cl_sessions);
+        spin_unlock(&clp->cl_lock);
        spin_unlock(&client_lock);
-        status = nfsd4_new_conn(rqstp, new);
+        status = nfsd4_new_conn_from_crses(rqstp, new);
        /* whoops: benny points out, status is ignored! (err, or bogus) */
        if (status) {
                free_session(&new->se_ref);
                return NULL;
        }
-        if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+        if (cses->flags & SESSION4_BACK_CHAN) {
                struct sockaddr *sa = svc_addr(rqstp);
+                /*
-                clp->cl_cb_session = new;
+                 * This is a little silly; with sessions there's no real
-                clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+                 * use for the callback address.  Use the peer address
-                svc_xprt_get(rqstp->rq_xprt);
+                 * as a reasonable default for now, but consider fixing
+                 * the rpc client not to require an address in the
+                 * future:
+                 */
                rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
                clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-                nfsd4_probe_callback(clp);
        }
+        nfsd4_probe_callback(clp);
        return new;
 }
@@ -813,7 +833,9 @@ static void
 unhash_session(struct nfsd4_session *ses)
 {
        list_del(&ses->se_hash);
+        spin_lock(&ses->se_client->cl_lock);
        list_del(&ses->se_perclnt);
+        spin_unlock(&ses->se_client->cl_lock);
 }
 /* must be called under the client_lock */
@@ -919,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
        mark_client_expired(clp);
        list_del(&clp->cl_lru);
+        spin_lock(&clp->cl_lock);
        list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
                list_del_init(&ses->se_hash);
+        spin_unlock(&clp->cl_lock);
 }
 static void
@@ -1047,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_refcount, 0);
-        atomic_set(&clp->cl_cb_set, 0);
+        clp->cl_cb_state = NFSD4_CB_UNKNOWN;
        INIT_LIST_HEAD(&clp->cl_idhash);
        INIT_LIST_HEAD(&clp->cl_strhash);
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
        INIT_LIST_HEAD(&clp->cl_lru);
+        INIT_LIST_HEAD(&clp->cl_callbacks);
        spin_lock_init(&clp->cl_lock);
        INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
        clp->cl_time = get_seconds();
@@ -1128,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
        return NULL;
 }
-/*
+static bool clp_used_exchangeid(struct nfs4_client *clp)
- * Return 1 iff clp's clientid establishment method matches the use_exchange_id
- * parameter. Matching is based on the fact the at least one of the
- * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
- *
- * FIXME: we need to unify the clientid namespaces for nfsv4.x
- * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
- * and SET_CLIENTID{,_CONFIRM}
- */
-static inline int
-match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
 {
-        bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+        return clp->cl_exchange_flags != 0;
-        return use_exchange_id == has_exchange_flags;
+} 
-}
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+find_confirmed_client_by_str(const char *dname, unsigned int hashval)
-                             bool use_exchange_id)
 {
        struct nfs4_client *clp;
        list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-                if (same_name(clp->cl_recdir, dname) &&
+                if (same_name(clp->cl_recdir, dname))
-                    match_clientid_establishment(clp, use_exchange_id))
                        return clp;
        }
        return NULL;
 }
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
-                               bool use_exchange_id)
 {
        struct nfs4_client *clp;
        list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-                if (same_name(clp->cl_recdir, dname) &&
+                if (same_name(clp->cl_recdir, dname))
-                    match_clientid_establishment(clp, use_exchange_id))
                        return clp;
        }
        return NULL;
 }
+/*
+ * Store the address family and the address from svcaddr into *sa for AF_INET
+ * and AF_INET6.  For any other family *sa is left untouched.  Only the family
+ * and address members are written here.
+ */
+static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
+{
+        if (family == AF_INET) {
+                struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+                sin->sin_family = AF_INET;
+                sin->sin_addr = svcaddr->addr;
+        } else if (family == AF_INET6) {
+                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+                sin6->sin6_family = AF_INET6;
+                sin6->sin6_addr = svcaddr->addr6;
+        }
+}
 static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
 {
        struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
+        struct sockaddr *sa = svc_addr(rqstp);
+        u32 scopeid = rpc_get_scope_id(sa);
        unsigned short expected_family;
        /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1201,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
        conn->cb_prog = se->se_callback_prog;
        conn->cb_ident = se->se_callback_ident;
+        rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
        return;
 out_err:
        conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1340,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
        case SP4_NONE:
                break;
        case SP4_SSV:
-                return nfserr_encr_alg_unsupp;
+                return nfserr_serverfault;
        default:
                BUG();                          /* checked by xdr code */
        case SP4_MACH_CRED:
@@ -1357,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
        nfs4_lock_state();
        status = nfs_ok;
-        conf = find_confirmed_client_by_str(dname, strhashval, true);
+        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
+                if (!clp_used_exchangeid(conf)) {
+                        status = nfserr_clid_inuse; /* XXX: ? */
+                        goto out;
+                }
                if (!same_verf(&verf, &conf->cl_verifier)) {
                        /* 18.35.4 case 8 */
                        if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1399,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
                goto out;
        }
-        unconf  = find_unconfirmed_client_by_str(dname, strhashval, true);
+        unconf  = find_unconfirmed_client_by_str(dname, strhashval);
        if (unconf) {
                /*
                 * Possible retry or client restart.  Per 18.35.4 case 4,
@@ -1556,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        status = nfs_ok;
        memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
               NFS4_MAX_SESSIONID_LEN);
+        memcpy(&cr_ses->fore_channel, &new->se_fchannel,
+                sizeof(struct nfsd4_channel_attrs));
        cs_slot->sl_seqid++;
        cr_ses->seqid = cs_slot->sl_seqid;
@@ -1577,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
        return argp->opcnt == resp->opcnt;
 }
+/*
+ * Validate the channel direction requested by BIND_CONN_TO_SESSION and
+ * map it to the direction that will be granted.
+ *
+ * @dir: in/out NFS4_CDFC4_* value.  NFS4_CDFC4_FORE and NFS4_CDFC4_BACK
+ *       are accepted unchanged; the two "_OR_BOTH" requests are rewritten
+ *       to NFS4_CDFC4_BOTH.
+ *
+ * Returns nfs_ok for a recognised direction, or nfserr_inval (leaving
+ * *dir untouched) for anything else.
+ */
+static __be32 nfsd4_map_bcts_dir(u32 *dir)
+{
+        switch (*dir) {
+        case NFS4_CDFC4_FORE:
+        case NFS4_CDFC4_BACK:
+                return nfs_ok;
+        case NFS4_CDFC4_FORE_OR_BOTH:
+        case NFS4_CDFC4_BACK_OR_BOTH:
+                *dir = NFS4_CDFC4_BOTH;
+                return nfs_ok;
+        }
+        return nfserr_inval;
+}
+/*
+ * BIND_CONN_TO_SESSION: attach the connection this request arrived on to
+ * the session named by bcts->sessionid.
+ *
+ * The operation is rejected with nfserr_not_only_op unless
+ * nfsd4_last_compound_op() holds.  The session is looked up under
+ * client_lock; when found it is stored in cstate->session and both a
+ * session reference and a client reference are taken.  bcts->dir is then
+ * validated (and possibly rewritten) by nfsd4_map_bcts_dir() before the
+ * connection is registered with nfsd4_new_conn().
+ *
+ * Returns nfs_ok on success, nfserr_badsession when no such session
+ * exists, or the error from nfsd4_map_bcts_dir() when the requested
+ * direction is not recognised.
+ */
+__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
+                     struct nfsd4_compound_state *cstate,
+                     struct nfsd4_bind_conn_to_session *bcts)
+{
+        __be32 status;
+        if (!nfsd4_last_compound_op(rqstp))
+                return nfserr_not_only_op;
+        spin_lock(&client_lock);
+        cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+        /* Sorta weird: we only need the refcnt'ing because new_conn acquires
+         * client_lock itself: */
+        if (cstate->session) {
+                nfsd4_get_session(cstate->session);
+                atomic_inc(&cstate->session->se_client->cl_refcount);
+        }
+        spin_unlock(&client_lock);
+        if (!cstate->session)
+                return nfserr_badsession;
+        status = nfsd4_map_bcts_dir(&bcts->dir);
+        /* Never bind a connection for a direction we just rejected, and
+         * report that rejection to the client instead of nfs_ok.
+         * NOTE(review): the references taken above remain attached to
+         * cstate->session on this path; presumably they are dropped by the
+         * normal end-of-compound cleanup -- confirm. */
+        if (!status)
+                nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+        return status;
+}
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
 {
        if (!session)
@@ -1615,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
        spin_unlock(&client_lock);
        nfs4_lock_state();
-        /* wait for callbacks */
+        nfsd4_probe_callback_sync(ses->se_client);
-        nfsd4_shutdown_callback(ses->se_client);
        nfs4_unlock_state();
        nfsd4_del_conns(ses);
@@ -1644,6 +1715,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
 {
        struct nfs4_client *clp = ses->se_client;
        struct nfsd4_conn *c;
+        int ret;
        spin_lock(&clp->cl_lock);
        c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1726,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
        }
        __nfsd4_hash_conn(new, ses);
        spin_unlock(&clp->cl_lock);
-        nfsd4_register_conn(new);
+        ret = nfsd4_register_conn(new);
+        if (ret)
+                /* oops; xprt is already down: */
+                nfsd4_conn_lost(&new->cn_xpt_user);
        return;
 }
@@ -1725,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 out:
        /* Hold a session reference until done processing the compound. */
        if (cstate->session) {
+                struct nfs4_client *clp = session->se_client;
                nfsd4_get_session(cstate->session);
-                atomic_inc(&session->se_client->cl_refcount);
+                atomic_inc(&clp->cl_refcount);
+                if (clp->cl_cb_state == NFSD4_CB_DOWN)
+                        seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
        }
        kfree(conn);
        spin_unlock(&client_lock);
@@ -1767,7 +1846,6 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                  struct nfsd4_setclientid *setclid)
 {
-        struct sockaddr         *sa = svc_addr(rqstp);
        struct xdr_netobj       clname = { 
                .len = setclid->se_namelen,
                .data = setclid->se_name,
@@ -1793,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        strhashval = clientstr_hashval(dname);
        nfs4_lock_state();
-        conf = find_confirmed_client_by_str(dname, strhashval, false);
+        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
                /* RFC 3530 14.2.33 CASE 0: */
                status = nfserr_clid_inuse;
+                if (clp_used_exchangeid(conf))
+                        goto out;
                if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
                        char addr_str[INET6_ADDRSTRLEN];
                        rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1811,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
         * has a description of SETCLIENTID request processing consisting
         * of 5 bullet points, labeled as CASE0 - CASE4 below.
         */
-        unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
+        unconf = find_unconfirmed_client_by_str(dname, strhashval);
        status = nfserr_resource;
        if (!conf) {
                /*
@@ -1868,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
         * for consistent minorversion use throughout:
         */
        new->cl_minorversion = 0;
-        gen_callback(new, setclid, rpc_get_scope_id(sa));
+        gen_callback(new, setclid, rqstp);
        add_to_unconfirmed(new, strhashval);
        setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
        setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1927,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
                        status = nfserr_clid_inuse;
                else {
-                        atomic_set(&conf->cl_cb_set, 0);
                        nfsd4_change_callback(conf, &unconf->cl_cb_conn);
                        nfsd4_probe_callback(conf);
                        expire_client(unconf);
@@ -1956,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        unsigned int hash =
                                clientstr_hashval(unconf->cl_recdir);
                        conf = find_confirmed_client_by_str(unconf->cl_recdir,
-                                                            hash, false);
+                                                            hash);
                        if (conf) {
                                nfsd4_remove_clid_dir(conf);
                                expire_client(conf);
@@ -2254,7 +2333,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 * Spawn a thread to perform a recall on the delegation represented
 * by the lease (file_lock)
 *
- * Called from break_lease() with lock_kernel() held.
+ * Called from break_lease() with lock_flocks() held.
 * Note: we assume break_lease will only call this *once* for any given
 * lease.
 */
@@ -2278,7 +2357,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
        list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
        spin_unlock(&recall_lock);
-        /* only place dl_time is set. protected by lock_kernel*/
+        /* only place dl_time is set. protected by lock_flocks*/
        dp->dl_time = get_seconds();
        /*
@@ -2292,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
        nfsd4_cb_recall(dp);
 }
-/*
- * The file_lock is being reapd.
- *
- * Called by locks_free_lock() with lock_kernel() held.
- */
-static
-void nfsd_release_deleg_cb(struct file_lock *fl)
-{
-        struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-        dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
-        if (!(fl->fl_flags & FL_LEASE) || !dp)
-                return;
-        dp->dl_flock = NULL;
-}
-/*
- * Called from setlease() with lock_kernel() held
- */
-static
-int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
-{
-        struct nfs4_delegation *onlistd =
-                (struct nfs4_delegation *)onlist->fl_owner;
-        struct nfs4_delegation *tryd =
-                (struct nfs4_delegation *)try->fl_owner;
-        if (onlist->fl_lmops != try->fl_lmops)
-                return 0;
-        return onlistd->dl_client == tryd->dl_client;
-}
 static
 int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 {
@@ -2338,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
        .fl_break = nfsd_break_deleg_cb,
-        .fl_release_private = nfsd_release_deleg_cb,
-        .fl_mylease = nfsd_same_client_deleg_cb,
        .fl_change = nfsd_change_deleg_cb,
 };
@@ -2506,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
        if (!fp->fi_fds[oflag]) {
                status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
                        &fp->fi_fds[oflag]);
-                if (status == nfserr_dropit)
-                        status = nfserr_jukebox;
                if (status)
                        return status;
        }
@@ -2588,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
        open->op_stateowner->so_client->cl_firststate = 1;
 }
+/*
+ * Should we give out recallable state?
+ *
+ * Yes when the client's callback channel state is NFSD4_CB_UP, and also
+ * when it is still NFSD4_CB_UNKNOWN for a client with a non-zero
+ * minorversion.  Any other state answers no.
+ */
+static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
+{
+        switch (clp->cl_cb_state) {
+        case NFSD4_CB_UP:
+                return true;
+        case NFSD4_CB_UNKNOWN:
+                /*
+                 * In the sessions case, since we don't have to establish a
+                 * separate connection for callbacks, we assume it's OK
+                 * until we hear otherwise:
+                 */
+                return clp->cl_minorversion != 0;
+        default:
+                return false;
+        }
+}
 /*
 * Attempt to hand out a delegation.
 */
@@ -2596,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
        struct nfs4_delegation *dp;
        struct nfs4_stateowner *sop = stp->st_stateowner;
-        int cb_up = atomic_read(&sop->so_client->cl_cb_set);
+        int cb_up;
        struct file_lock *fl;
        int status, flag = 0;
+        cb_up = nfsd4_cb_channel_good(sop->so_client);
        flag = NFS4_OPEN_DELEGATE_NONE;
        open->op_recall = 0;
        switch (open->op_claim_type) {
@@ -2647,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        dp->dl_flock = fl;
        /* vfs_setlease checks to see if delegation should be handed out.
-         * the lock_manager callbacks fl_mylease and fl_change are used
+         * the lock_manager callback fl_change is used
         */
        if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
                dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2786,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        renew_client(clp);
        status = nfserr_cb_path_down;
        if (!list_empty(&clp->cl_delegations)
-                        && !atomic_read(&clp->cl_cb_set))
+                        && clp->cl_cb_state != NFSD4_CB_UP)
                goto out;
        status = nfs_ok;
 out:
@@ -3073,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                if (status)
                        goto out;
                renew_client(dp->dl_client);
-                if (filpp)
+                if (filpp) {
                        *filpp = find_readable_file(dp->dl_file);
-                BUG_ON(!*filpp);
+                        BUG_ON(!*filpp);
+                }
        } else { /* open or lock stateid */
                stp = find_stateid(stateid, flags);
                if (!stp)
@@ -4099,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
        unsigned int strhashval = clientstr_hashval(name);
        struct nfs4_client *clp;
-        clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
+        clp = find_confirmed_client_by_str(name, strhashval);
        return clp ? 1 : 0;
 }
@@ -4328,7 +4383,7 @@ __nfs4_state_shutdown(void)
 void
 nfs4_state_shutdown(void)
 {
-        cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+        cancel_delayed_work_sync(&laundromat_work);
        destroy_workqueue(laundry_wq);
        locks_end_grace(&nfsd4_manager);
        nfs4_lock_state();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a0402..956629b9cdc 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
-#include <linux/nfsd_idmap.h>
-#include <linux/nfs4_acl.h>
 #include <linux/sunrpc/svcauth_gss.h>
+#include "idmap.h"
+#include "acl.h"
 #include "xdr4.h"
 #include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
 /*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                        len += XDR_QUADLEN(dummy32) << 2;
                        READMEM(buf, dummy32);
                        ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
-                        host_err = 0;
+                        status = nfs_ok;
                        if (ace->whotype != NFS4_ACL_WHO_NAMED)
                                ace->who = 0;
                        else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
-                                host_err = nfsd_map_name_to_gid(argp->rqstp,
+                                status = nfsd_map_name_to_gid(argp->rqstp,
                                                buf, dummy32, &ace->who);
                        else
-                                host_err = nfsd_map_name_to_uid(argp->rqstp,
+                                status = nfsd_map_name_to_uid(argp->rqstp,
                                                buf, dummy32, &ace->who);
-                        if (host_err)
+                        if (status)
-                                goto out_nfserr;
+                                return status;
                }
        } else
                *acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
        DECODE_TAIL;
 }
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{       /* Decode the BIND_CONN_TO_SESSION arguments from argp into *bcts. */
+        DECODE_HEAD;
+        u32 dummy;      /* receives the final argument word; not used afterwards */
+        READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);   /* session id plus two 32-bit words */
+        COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        READ32(bcts->dir);      /* requested channel direction */
+        /* XXX: Perhaps Tom Tucker could help us figure out how we
+         * should be using ctsa_use_conn_in_rdma_mode: */
+        READ32(dummy);
+        DECODE_TAIL;
+}
 static __be32
 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 {
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
 }
 static __be32
+nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+                     struct nfsd4_secinfo_no_name *sin)
+{       /* Decode the SECINFO_NO_NAME argument from argp into *sin. */
+        DECODE_HEAD;
+        READ_BUF(4);            /* the only argument: one 32-bit word */
+        READ32(sin->sin_style); /* stored without any range check in this routine */
+        DECODE_TAIL;
+}
+static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
        __be32 status;
@@ -1005,7 +1032,7 @@ static __be32
 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
                         struct nfsd4_exchange_id *exid)
 {
-        int dummy;
+        int dummy, tmp;
        DECODE_HEAD;
        READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
                /* ssp_hash_algs<> */
                READ_BUF(4);
-                READ32(dummy);
+                READ32(tmp);
-                READ_BUF(dummy);
+                while (tmp--) {
-                p += XDR_QUADLEN(dummy);
+                        READ_BUF(4);
+                        READ32(dummy);
+                        READ_BUF(dummy);
+                        p += XDR_QUADLEN(dummy);
+                }
                /* ssp_encr_algs<> */
                READ_BUF(4);
-                READ32(dummy);
+                READ32(tmp);
-                READ_BUF(dummy);
+                while (tmp--) {
-                p += XDR_QUADLEN(dummy);
+                        READ_BUF(4);
+                        READ32(dummy);
+                        READ_BUF(dummy);
+                        p += XDR_QUADLEN(dummy);
+                }
                /* ssp_window and ssp_num_gss_handles */
                READ_BUF(8);
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
        /* new operations for NFSv4.1 */
        [OP_BACKCHANNEL_CTL]    = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
        [OP_EXCHANGE_ID]        = (nfsd4_dec)nfsd4_decode_exchange_id,
        [OP_CREATE_SESSION]     = (nfsd4_dec)nfsd4_decode_create_session,
        [OP_DESTROY_SESSION]    = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
        [OP_LAYOUTCOMMIT]       = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_LAYOUTGET]          = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_LAYOUTRETURN]       = (nfsd4_dec)nfsd4_decode_notsupp,
-        [OP_SECINFO_NO_NAME]    = (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_SECINFO_NO_NAME]    = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
        [OP_SEQUENCE]           = (nfsd4_dec)nfsd4_decode_sequence,
        [OP_SET_SSV]            = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_TEST_STATEID]       = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
        case nfserr_resource:
                nfserr = nfserr_toosmall;
                goto fail;
-        case nfserr_dropit:
-                goto fail;
        case nfserr_noent:
                goto skip_entry;
        default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
        return nfserr;
 }
+/*
+ * Encode the BIND_CONN_TO_SESSION result: the session id, the direction
+ * held in bcts->dir, and one trailing zero word.  Nothing is written when
+ * nfserr is already set; nfserr is handed back unchanged in both cases.
+ */
+static __be32
+nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
+                                  __be32 nfserr,
+                                  struct nfsd4_bind_conn_to_session *bcts)
+{
+        __be32 *p;
+        /* The operation failed: there is no result body to encode. */
+        if (nfserr)
+                return nfserr;
+        RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
+        WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        WRITE32(bcts->dir);
+        /* XXX: ? */
+        WRITE32(0);
+        ADJUST_ARGS();
+        return nfserr;
+}
 static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 }
 static __be32
-nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
-                     struct nfsd4_secinfo *secinfo)
+                         __be32 nfserr,struct svc_export *exp)
 {
        int i = 0;
-        struct svc_export *exp = secinfo->si_exp;
        u32 nflavs;
        struct exp_flavor_info *flavs;
        struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
        return nfserr;
 }
+/*
+ * SECINFO encoder: hands the export recorded in secinfo->si_exp to
+ * nfsd4_do_encode_secinfo() and returns whatever that returns.
+ */
+static __be32
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+                     struct nfsd4_secinfo *secinfo)
+{
+        struct svc_export *exp = secinfo->si_exp;
+        return nfsd4_do_encode_secinfo(resp, nfserr, exp);
+}
+/*
+ * SECINFO_NO_NAME encoder: hands the export recorded in secinfo->sin_exp
+ * to nfsd4_do_encode_secinfo() and returns whatever that returns.
+ */
+static __be32
+nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+                     struct nfsd4_secinfo_no_name *secinfo)
+{
+        struct svc_export *exp = secinfo->sin_exp;
+        return nfsd4_do_encode_secinfo(resp, nfserr, exp);
+}
 /*
 * The SETATTR encode routine is special -- it always encodes a bitmap,
 * regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
        WRITE32(seq->seqid);
        WRITE32(seq->slotid);
        WRITE32(seq->maxslots);
-        /*
+        /* For now: target_maxslots = maxslots */
-         * FIXME: for now:
-         *   target_maxslots = maxslots
-         *   status_flags = 0
-         */
        WRITE32(seq->maxslots);
-        WRITE32(0);
+        WRITE32(seq->status_flags);
        ADJUST_ARGS();
        resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
        /* NFSv4.1 operations */
        [OP_BACKCHANNEL_CTL]    = (nfsd4_enc)nfsd4_encode_noop,
-        [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+        [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
        [OP_EXCHANGE_ID]        = (nfsd4_enc)nfsd4_encode_exchange_id,
        [OP_CREATE_SESSION]     = (nfsd4_enc)nfsd4_encode_create_session,
        [OP_DESTROY_SESSION]    = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
        [OP_LAYOUTCOMMIT]       = (nfsd4_enc)nfsd4_encode_noop,
        [OP_LAYOUTGET]          = (nfsd4_enc)nfsd4_encode_noop,
        [OP_LAYOUTRETURN]       = (nfsd4_enc)nfsd4_encode_noop,
-        [OP_SECINFO_NO_NAME]    = (nfsd4_enc)nfsd4_encode_noop,
+        [OP_SECINFO_NO_NAME]    = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
        [OP_SEQUENCE]           = (nfsd4_enc)nfsd4_encode_sequence,
        [OP_SET_SSV]            = (nfsd4_enc)nfsd4_encode_noop,
        [OP_TEST_STATEID]       = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d..33b3e2b0677 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
 #include <linux/namei.h>
 #include <linux/ctype.h>
-#include <linux/nfsd_idmap.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
+#include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+#ifdef CONFIG_NFSD_DEPRECATED
        static int warned;
        if (file->f_dentry->d_name.name[0] == '.' && !warned) {
                printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
                       current->comm, file->f_dentry->d_name.name);
                warned = 1;
        }
+#endif
        if (! file->private_data) {
                /* An attempt to read a transaction file without writing
                 * causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19..7ecfa242030 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void		nfsd_lockd_shutdown(void);
 #define nfserr_attrnotsupp      cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define nfserr_bad_xdr          cpu_to_be32(NFSERR_BAD_XDR)
 #define nfserr_openmode         cpu_to_be32(NFSERR_OPENMODE)
+#define nfserr_badowner         cpu_to_be32(NFSERR_BADOWNER)
 #define nfserr_locks_held       cpu_to_be32(NFSERR_LOCKS_HELD)
 #define nfserr_op_illegal       cpu_to_be32(NFSERR_OP_ILLEGAL)
 #define nfserr_grace            cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784..e15dc45fc5e 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
                { nfserr_stale, -ESTALE },
                { nfserr_jukebox, -ETIMEDOUT },
                { nfserr_jukebox, -ERESTARTSYS },
-                { nfserr_dropit, -EAGAIN },
+                { nfserr_jukebox, -EAGAIN },
-                { nfserr_dropit, -ENOMEM },
+                { nfserr_jukebox, -EWOULDBLOCK },
-                { nfserr_badname, -ESRCH },
+                { nfserr_jukebox, -ENOMEM },
                { nfserr_io, -ETXTBSY },
                { nfserr_notsupp, -EOPNOTSUPP },
                { nfserr_toosmall, -ETOOSMALL },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f..18743c4d8bc 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
        /* Now call the procedure handler, and encode NFS status. */
        nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
        nfserr = map_new_errors(rqstp->rq_vers, nfserr);
-        if (nfserr == nfserr_dropit) {
+        if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
                dprintk("nfsd: Dropping request; may be revisited later\n");
                nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
                return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b068..3074656ba7b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
 struct nfsd4_callback {
        void *cb_op;
        struct nfs4_client *cb_clp;
+        struct list_head cb_per_client;
        u32 cb_minorversion;
        struct rpc_message cb_msg;
        const struct rpc_call_ops *cb_ops;
        struct work_struct cb_work;
+        bool cb_done;
 };
 struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
        atomic_t                dl_count;       /* ref count */
        struct nfs4_client      *dl_client;
        struct nfs4_file        *dl_file;
+        struct file             *dl_vfs_file;
        struct file_lock        *dl_flock;
        u32                     dl_type;
        time_t                  dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
 struct nfs4_cb_conn {
        /* SETCLIENTID info */
        struct sockaddr_storage cb_addr;
+        struct sockaddr_storage cb_saddr;
        size_t                  cb_addrlen;
        u32                     cb_prog; /* used only in 4.0 case;
                                            per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
        u32                             gid;
 };
+struct nfsd4_bind_conn_to_session {      /* BIND_CONN_TO_SESSION arguments and result */
+        struct nfs4_sessionid           sessionid;      /* session the connection is bound to */
+        u32                             dir;            /* NFS4_CDFC4_* channel direction */
+};
 /* The single slot clientid cache structure */
 struct nfsd4_clid_slot {
        u32                             sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
        unsigned long           cl_cb_flags;
        struct rpc_clnt         *cl_cb_client;
        u32                     cl_cb_ident;
-        atomic_t                cl_cb_set;
+#define NFSD4_CB_UP             0
+#define NFSD4_CB_UNKNOWN        1
+#define NFSD4_CB_DOWN           2
+        int                     cl_cb_state;
        struct nfsd4_callback   cl_cb_null;
        struct nfsd4_session    *cl_cb_session;
+        struct list_head        cl_callbacks; /* list of in-progress callbacks */
        /* for all client information that callback code might need: */
        spinlock_t              cl_lock;
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff0..641117f2188 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
-#define MSNFS   /* HACK HACK */
 /*
 * File operations used by nfsd. Some of these have been ripped from
 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
 #endif /* CONFIG_NFSD_V3 */
 #ifdef CONFIG_NFSD_V4
-#include <linux/nfs4_acl.h>
+#include "acl.h"
-#include <linux/nfsd_idmap.h>
+#include "idmap.h"
 #endif /* CONFIG_NFSD_V4 */
 #include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
                            .dentry = dget(dentry)};
        int err = 0;
-        while (d_mountpoint(path.dentry) && follow_down(&path))
+        err = follow_down(&path, false);
-                ;
+        if (err < 0)
+                goto out;
        exp2 = rqst_exp_get_by_name(rqstp, &path);
        if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
        return err;
 }
+/*
+ * Break any lease on @inode without blocking, as a writer would.
+ *
+ * Lease breaking is attempted for regular files only; for every other
+ * file type this reports success (0) without calling break_lease().
+ * Otherwise the result of break_lease() is returned as-is.
+ */
+static int nfsd_break_lease(struct inode *inode)
+{
+        if (S_ISREG(inode->i_mode))
+                return break_lease(inode, O_WRONLY | O_NONBLOCK);
+        return 0;
+}
 /*
 * Commit metadata changes to stable storage.
 */
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                                goto out;
                }
-                /*
-                 * If we are changing the size of the file, then
-                 * we need to break all leases.
-                 */
-                host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
-                if (host_err == -EWOULDBLOCK)
-                        host_err = -ETIMEDOUT;
-                if (host_err) /* ENOMEM or EWOULDBLOCK */
-                        goto out_nfserr;
                host_err = get_write_access(inode);
                if (host_err)
                        goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
        err = nfserr_notsync;
        if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
+                host_err = nfsd_break_lease(inode);
+                if (host_err)
+                        goto out_nfserr;
                fh_lock(fhp);
                host_err = notify_change(dentry, iap);
                err = nfserrno(host_err);
                fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
         */
        if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
                host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
-        if (host_err == -EWOULDBLOCK)
-                host_err = -ETIMEDOUT;
        if (host_err) /* NOMEM or WOULDBLOCK */
                goto out_nfserr;
@@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
        struct page *page = buf->page;
        size_t size;
-        int ret;
-        ret = buf->ops->confirm(pipe, buf);
-        if (unlikely(ret))
-                return ret;
        size = sd->len;
@@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
        return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
-static inline int svc_msnfs(struct svc_fh *ffhp)
-{
-#ifdef MSNFS
-        return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
-#else
-        return 0;
-#endif
-}
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
              loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        err = nfserr_perm;
        inode = file->f_path.dentry->d_inode;
-        if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
-                goto out;
        if (file->f_op->splice_read && rqstp->rq_splice_ok) {
                struct splice_desc sd = {
                        .len            = 0,
@@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                fsnotify_access(file);
        } else 
                err = nfserrno(host_err);
-out:
        return err;
 }
@@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        int                     stable = *stablep;
        int                     use_wgather;
-#ifdef MSNFS
-        err = nfserr_perm;
-        if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-                (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
-                goto out;
-#endif
        dentry = file->f_path.dentry;
        inode = dentry->d_inode;
        exp   = fhp->fh_export;
@@ -1050,7 +1023,6 @@ out_nfserr:
                err = 0;
        else
                err = nfserrno(host_err);
-out:
        return err;
 }
@@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                err = nfserrno(host_err);
                goto out_dput;
        }
+        err = nfserr_noent;
+        if (!dold->d_inode)
+                goto out_drop_write;
+        host_err = nfsd_break_lease(dold->d_inode);
+        if (host_err)
+                goto out_drop_write;
        host_err = vfs_link(dold, dirp, dnew);
        if (!host_err) {
                err = nfserrno(commit_metadata(ffhp));
@@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                else
                        err = nfserrno(host_err);
        }
+out_drop_write:
        mnt_drop_write(tfhp->fh_export->ex_path.mnt);
 out_dput:
        dput(dnew);
@@ -1755,13 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (ndentry == trap)
                goto out_dput_new;
-        if (svc_msnfs(ffhp) &&
-                ((atomic_read(&odentry->d_count) > 1)
-                 || (atomic_read(&ndentry->d_count) > 1))) {
-                        host_err = -EPERM;
-                        goto out_dput_new;
-        }
        host_err = -EXDEV;
        if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
                goto out_dput_new;
@@ -1769,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (host_err)
                goto out_dput_new;
+        host_err = nfsd_break_lease(odentry->d_inode);
+        if (host_err)
+                goto out_drop_write;
        host_err = vfs_rename(fdir, odentry, tdir, ndentry);
        if (!host_err) {
                host_err = commit_metadata(tfhp);
                if (!host_err)
                        host_err = commit_metadata(ffhp);
        }
+out_drop_write:
        mnt_drop_write(ffhp->fh_export->ex_path.mnt);
 out_dput_new:
        dput(ndentry);
 out_dput_old:
@@ -1840,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (host_err)
                goto out_nfserr;
-        if (type != S_IFDIR) { /* It's UNLINK */
+        host_err = nfsd_break_lease(rdentry->d_inode);
-#ifdef MSNFS
+        if (host_err)
-                if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+                goto out_put;
-                        (atomic_read(&rdentry->d_count) > 1)) {
+        if (type != S_IFDIR)
-                        host_err = -EPERM;
-                } else
-#endif
                host_err = vfs_unlink(dirp, rdentry);
-        } else { /* It's RMDIR */
+        else
                host_err = vfs_rmdir(dirp, rdentry);
-        }
+out_put:
        dput(rdentry);
        if (!host_err)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae..366401e1a53 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
        struct svc_export *si_exp;                      /* response */
 };
+struct nfsd4_secinfo_no_name {
+        u32 sin_style;                                  /* request */
+        struct svc_export *sin_exp;                     /* response */
+};
 struct nfsd4_setattr {
        stateid_t       sa_stateid;         /* request */
        u32             sa_bmval[3];        /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
        u32                     cachethis;              /* request */
 #if 0
        u32                     target_maxslots;        /* response */
-        u32                     status_flags;           /* response */
 #endif /* not yet */
+        u32                     status_flags;           /* response */
 };
 struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
                /* NFSv4.1 */
                struct nfsd4_exchange_id        exchange_id;
+                struct nfsd4_bind_conn_to_session bind_conn_to_session;
                struct nfsd4_create_session     create_session;
                struct nfsd4_destroy_session    destroy_session;
                struct nfsd4_sequence           sequence;
@@ -484,18 +490,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
 static inline void
 set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
-        BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
+        BUG_ON(!fhp->fh_pre_saved);
-        cinfo->atomic = 1;
+        cinfo->atomic = fhp->fh_post_saved;
        cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
-        if (cinfo->change_supported) {
-                cinfo->before_change = fhp->fh_pre_change;
+        cinfo->before_change = fhp->fh_pre_change;
-                cinfo->after_change = fhp->fh_post_change;
+        cinfo->after_change = fhp->fh_post_change;
-        } else {
+        cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
-                cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+        cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
-                cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+        cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
-                cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+        cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-                cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-        }
 }
 int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
@@ -519,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
                struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
                struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
 extern __be32 nfsd4_create_session(struct svc_rqst *,
                struct nfsd4_compound_state *,
                struct nfsd4_create_session *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062ba..3ee67c67cc5 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 {
-        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+        return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
+}
+static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
+                                     const char *fname, int err)
+{ /* Translate -EINVAL from a bmap operation into -EIO; any other value of err is returned as-is. */
+        struct inode *inode = bmap->b_inode;
+        if (err == -EINVAL) { /* reported through nilfs_error() under the caller-supplied name fname */
+                nilfs_error(inode->i_sb, fname,
+                            "broken bmap (inode number=%lu)\n", inode->i_ino);
+                err = -EIO;
+        }
+        return err; /* -EIO if the input was -EINVAL, otherwise the input value */
 }
 /**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
-        if (ret < 0)
+        if (ret < 0) {
+                ret = nilfs_bmap_convert_error(bmap, __func__, ret);
                goto out;
+        }
        if (NILFS_BMAP_USE_VBN(bmap)) {
                ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
                                          &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
        up_read(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_insert(bmap, key, rec);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
-        if (!ret)
-                *key = lastkey;
        up_read(&bmap->b_sem);
+        if (ret < 0)
+                ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+        else
+                *key = lastkey;
        return ret;
 }
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_delete(bmap, key);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
        down_write(&bmap->b_sem);
        ret = nilfs_bmap_do_truncate(bmap, key);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_propagate(bmap, bh);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_mark(bmap, key, level);
        up_write(&bmap->b_sem);
-        return ret;
+        return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 /**
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb74..388e9e8f528 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
        if (pblocknr == 0) {
                pblocknr = blocknr;
                if (inode->i_ino != NILFS_DAT_INO) {
-                        struct inode *dat =
+                        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
-                                nilfs_dat_inode(NILFS_I_NILFS(inode));
                        /* blocknr is a virtual block number */
                        err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33..59e5fe742f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
         * the device at this point.
         *
         * To prevent nilfs_dat_translate() from returning the
-         * uncommited block number, this makes a copy of the entry
+         * uncommitted block number, this makes a copy of the entry
         * buffer and redirects nilfs_dat_translate() to the copy.
         */
        if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f..9d45773b79e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
                               unsigned from, unsigned to)
 {
        struct inode *dir = mapping->host;
-        struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
        loff_t pos = page_offset(page) + from;
        unsigned len = to - from;
        unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
                i_size_write(dir, pos + copied);
        if (IS_DIRSYNC(dir))
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
-        err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+        err = nilfs_set_file_dirty(dir, nr_dirty);
        WARN_ON(err); /* do not happen */
        unlock_page(page);
 }
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6f..2f560c9fb80 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
        .truncate       = nilfs_truncate,
        .setattr        = nilfs_setattr,
        .permission     = nilfs_permission,
+        .fiemap         = nilfs_fiemap,
 };
 /* end of file */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c..caf9a6a3fb5 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 int nilfs_init_gcinode(struct inode *inode)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
        inode->i_mode = S_IFREG;
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
        ii->i_flags = 0;
        nilfs_bmap_init_gc(ii->i_bmap);
-        /*
-         * Add the inode to GC inode list. Garbage Collection
-         * is serialized and no two processes manipulate the
-         * list simultaneously.
-         */
-        igrab(inode);
-        list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
        return 0;
 }
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f9..bfc73d3a30e 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
        }
        err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
-        if (unlikely(err)) {
+        if (unlikely(err))
-                if (err == -EINVAL)
+                nilfs_warning(sb, __func__, "unable to read inode: %lu",
-                        nilfs_error(sb, __func__, "ifile is broken");
+                              (unsigned long) ino);
-                else
-                        nilfs_warning(sb, __func__,
-                                      "unable to read inode: %lu",
-                                      (unsigned long) ino);
-        }
        return err;
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 71d4bc8464e..2fd440d8d6b 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
        struct nilfs_inode_info *ii = NILFS_I(inode);
        __u64 blknum = 0;
        int err = 0, ret;
-        struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+        struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
        unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
        down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                                       inode->i_ino,
                                       (unsigned long long)blkoff);
                                err = 0;
-                        } else if (err == -EINVAL) {
-                                nilfs_error(inode->i_sb, __func__,
-                                            "broken bmap (inode=%lu)\n",
-                                            inode->i_ino);
-                                err = -EIO;
                        }
                        nilfs_transaction_abort(inode->i_sb);
                        goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                nilfs_transaction_commit(inode->i_sb); /* never fails */
                /* Error handling should be detailed */
                set_buffer_new(bh_result);
+                set_buffer_delay(bh_result);
                map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
                                                      to proper value */
        } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
        if (ret) {
                struct inode *inode = page->mapping->host;
-                struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
                unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
-                nilfs_set_file_dirty(sbi, inode, nr_dirty);
+                nilfs_set_file_dirty(inode, nr_dirty);
        }
        return ret;
 }
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
                                                  start + copied);
        copied = generic_write_end(file, mapping, pos, len, copied, page,
                                   fsdata);
-        nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
+        nilfs_set_file_dirty(inode, nr_dirty);
        err = nilfs_transaction_commit(inode->i_sb);
        return err ? : copied;
 }
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
                              struct nilfs_root *root, unsigned long ino,
                              struct inode *inode)
 {
-        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
-        struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
        struct buffer_head *bh;
        struct nilfs_inode *raw_inode;
        int err;
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
        if (unlikely(err))
                goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
        }
        nilfs_ifile_unmap_inode(root->ifile, ino, bh);
        brelse(bh);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        nilfs_set_inode_flags(inode);
        return 0;
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
        brelse(bh);
 bad_inode:
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        return err;
 }
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
        if (!test_bit(NILFS_I_BMAP, &ii->i_state))
                return;
- repeat:
+repeat:
        ret = nilfs_bmap_last_key(ii->i_bmap, &b);
        if (ret == -ENOENT)
                return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
                     nilfs_bmap_truncate(ii->i_bmap, b) == 0))
                goto repeat;
- failed:
+failed:
-        if (ret == -EINVAL)
+        nilfs_warning(ii->vfs_inode.i_sb, __func__,
-                nilfs_error(ii->vfs_inode.i_sb, __func__,
+                      "failed to truncate bmap (ino=%lu, err=%d)",
-                            "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
+                      ii->vfs_inode.i_ino, ret);
-        else
-                nilfs_warning(ii->vfs_inode.i_sb, __func__,
-                              "failed to truncate bmap (ino=%lu, err=%d)",
-                              ii->vfs_inode.i_ino, ret);
 }
 void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
        nilfs_mark_inode_dirty(inode);
-        nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
+        nilfs_set_file_dirty(inode, 0);
        nilfs_transaction_commit(sb);
        /* May construct a logical segment and may fail in sync mode.
           But truncate has no return value. */
@@ -785,20 +775,24 @@ out_err:
        return err;
 }
-int nilfs_permission(struct inode *inode, int mask)
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct nilfs_root *root = NILFS_I(inode)->i_root;
+        struct nilfs_root *root;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        root = NILFS_I(inode)->i_root;
        if ((mask & MAY_WRITE) && root &&
            root->cno != NILFS_CPTREE_CURRENT_CNO)
                return -EROFS; /* snapshot is not writable */
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
-int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
-                           struct buffer_head **pbh)
 {
+        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct nilfs_inode_info *ii = NILFS_I(inode);
        int err;
@@ -839,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
        return ret;
 }
-int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
-                         unsigned nr_dirty)
 {
+        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct nilfs_inode_info *ii = NILFS_I(inode);
        atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -874,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
 int nilfs_mark_inode_dirty(struct inode *inode)
 {
-        struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
        struct buffer_head *ibh;
        int err;
-        err = nilfs_load_inode_block(sbi, inode, &ibh);
+        err = nilfs_load_inode_block(inode, &ibh);
        if (unlikely(err)) {
                nilfs_warning(inode->i_sb, __func__,
                              "failed to reget inode block.\n");
@@ -920,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
        nilfs_mark_inode_dirty(inode);
        nilfs_transaction_commit(inode->i_sb); /* never fails */
 }
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                 __u64 start, __u64 len)
+{ /*
+   * Report the extent layout of inode through fieinfo for the blocks
+   * start >> blkbits .. (start + len - 1) >> blkbits.  Ranges found by
+   * nilfs_find_uncommitted_extent() are reported as delayed allocation;
+   * ranges found by nilfs_bmap_lookup_contig() are reported as mapped.
+   * Returns the fiemap_check_flags() error, or the result of the last
+   * fiemap_fill_next_extent() call with 1 folded to 0.
+   */
+        struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
+        __u64 logical = 0, phys = 0, size = 0; /* pending extent not yet reported; size == 0 means none */
+        __u32 flags = 0;
+        loff_t isize;
+        sector_t blkoff, end_blkoff;
+        sector_t delalloc_blkoff;
+        unsigned long delalloc_blklen;
+        unsigned int blkbits = inode->i_blkbits;
+        int ret, n;
+        ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+        if (ret)
+                return ret;
+        mutex_lock(&inode->i_mutex); /* released at the single exit below; every loop exit is a break */
+        isize = i_size_read(inode);
+        blkoff = start >> blkbits;
+        end_blkoff = (start + len - 1) >> blkbits;
+        delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
+                                                        &delalloc_blkoff); /* presumably 0 when no uncommitted extent follows blkoff -- TODO confirm against the helper */
+        do {
+                __u64 blkphy;
+                unsigned int maxblocks;
+                if (delalloc_blklen && blkoff == delalloc_blkoff) { /* scan position reached the delayed-allocation extent */
+                        if (size) {
+                                /* End of the current extent */
+                                ret = fiemap_fill_next_extent(
+                                        fieinfo, logical, phys, size, flags);
+                                if (ret)
+                                        break;
+                        }
+                        if (blkoff > end_blkoff)
+                                break;
+                        flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
+                        logical = blkoff << blkbits;
+                        phys = 0;
+                        size = delalloc_blklen << blkbits;
+                        blkoff = delalloc_blkoff + delalloc_blklen; /* skip past it and look for the next one */
+                        delalloc_blklen = nilfs_find_uncommitted_extent(
+                                inode, blkoff, &delalloc_blkoff);
+                        continue;
+                }
+                /*
+                 * Limit the number of blocks that we look up so as
+                 * not to get into the next delayed allocation extent.
+                 */
+                maxblocks = INT_MAX;
+                if (delalloc_blklen)
+                        maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
+                                          maxblocks);
+                blkphy = 0;
+                down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+                n = nilfs_bmap_lookup_contig(
+                        NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
+                up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+                if (n < 0) {
+                        int past_eof;
+                        if (unlikely(n != -ENOENT))
+                                break; /* error -- NOTE(review): ret is not assigned from n, so this failure is not returned to the caller; confirm whether that is intended */
+                        /* HOLE: -ENOENT from the lookup; advance one block at a time */
+                        blkoff++;
+                        past_eof = ((blkoff << blkbits) >= isize);
+                        if (size) {
+                                /* End of the current extent */
+                                if (past_eof)
+                                        flags |= FIEMAP_EXTENT_LAST;
+                                ret = fiemap_fill_next_extent(
+                                        fieinfo, logical, phys, size, flags);
+                                if (ret)
+                                        break;
+                                size = 0;
+                        }
+                        if (blkoff > end_blkoff || past_eof)
+                                break;
+                } else {
+                        if (size) {
+                                if (phys && blkphy << blkbits == phys + size) {
+                                        /* The current extent goes on: new run is physically adjacent */
+                                        size += n << blkbits;
+                                } else {
+                                        /* Terminate the current extent */
+                                        ret = fiemap_fill_next_extent(
+                                                fieinfo, logical, phys, size,
+                                                flags);
+                                        if (ret || blkoff > end_blkoff)
+                                                break;
+                                        /* Start another extent */
+                                        flags = FIEMAP_EXTENT_MERGED;
+                                        logical = blkoff << blkbits;
+                                        phys = blkphy << blkbits;
+                                        size = n << blkbits;
+                                }
+                        } else {
+                                /* Start a new extent */
+                                flags = FIEMAP_EXTENT_MERGED;
+                                logical = blkoff << blkbits;
+                                phys = blkphy << blkbits;
+                                size = n << blkbits;
+                        }
+                        blkoff += n;
+                }
+                cond_resched();
+        } while (true);
+        /* If ret is 1 then we just hit the end of the extent array */
+        if (ret == 1)
+                ret = 0;
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bf..496738963fd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
+        ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -242,8 +242,7 @@ static ssize_t
 nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
                          void *buf, size_t size, size_t nmembs)
 {
-        struct inode *dat = nilfs_dat_inode(nilfs);
+        struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
-        struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
        struct nilfs_bdesc *bdescs = buf;
        int ret, i;
@@ -337,6 +336,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
                                   struct nilfs_argv *argv, void *buf)
 {
        size_t nmembs = argv->v_nmembs;
+        struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
        struct inode *inode;
        struct nilfs_vdesc *vdesc;
        struct buffer_head *bh, *n;
@@ -349,10 +349,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
                ino = vdesc->vd_ino;
                cno = vdesc->vd_cno;
                inode = nilfs_iget_for_gc(sb, ino, cno);
-                if (unlikely(inode == NULL)) {
+                if (IS_ERR(inode)) {
-                        ret = -ENOMEM;
+                        ret = PTR_ERR(inode);
                        goto failed;
                }
+                if (list_empty(&NILFS_I(inode)->i_dirty)) {
+                        /*
+                         * Add the inode to GC inode list. Garbage Collection
+                         * is serialized and no two processes manipulate the
+                         * list simultaneously.
+                         */
+                        igrab(inode);
+                        list_add(&NILFS_I(inode)->i_dirty,
+                                 &nilfs->ns_gc_inodes);
+                }
                do {
                        ret = nilfs_ioctl_move_inode_block(inode, vdesc,
                                                           &buffers);
@@ -409,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
        size_t nmembs = argv->v_nmembs;
        int ret;
-        ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
+        ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
        return (ret < 0) ? ret : nmembs;
 }
@@ -418,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
                                         struct nilfs_argv *argv, void *buf)
 {
        size_t nmembs = argv->v_nmembs;
-        struct inode *dat = nilfs_dat_inode(nilfs);
+        struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
-        struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
        struct nilfs_bdesc *bdescs = buf;
        int ret, i;
@@ -438,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
                        /* skip dead block */
                        continue;
                if (bdescs[i].bd_level == 0) {
-                        ret = nilfs_mdt_mark_block_dirty(dat,
+                        ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
                                                         bdescs[i].bd_offset);
                        if (ret < 0) {
                                WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9..6a0e2a189f6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
 *
 * %-ENOENT - the specified block does not exist (hole block)
 *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
- *
 * %-EROFS - Read only filesystem (for create mode)
 */
 int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
- *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
 */
 int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
 {
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
 * %-EIO - I/O error
 *
 * %-ENOENT - the specified block does not exist (hole block)
- *
- * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
 */
 int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
 {
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
        struct buffer_head *bh_frozen;
        struct page *page;
        int blkbits = inode->i_blkbits;
-        int ret = -ENOMEM;
        page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
        if (!page)
-                return ret;
+                return -ENOMEM;
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << blkbits, 0);
        bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
-        if (bh_frozen) {
-                if (!buffer_uptodate(bh_frozen))
+        if (!buffer_uptodate(bh_frozen))
-                        nilfs_copy_buffer(bh_frozen, bh);
+                nilfs_copy_buffer(bh_frozen, bh);
-                if (list_empty(&bh_frozen->b_assoc_buffers)) {
+        if (list_empty(&bh_frozen->b_assoc_buffers)) {
-                        list_add_tail(&bh_frozen->b_assoc_buffers,
+                list_add_tail(&bh_frozen->b_assoc_buffers,
-                                      &shadow->frozen_buffers);
+                              &shadow->frozen_buffers);
-                        set_buffer_nilfs_redirected(bh);
+                set_buffer_nilfs_redirected(bh);
-                } else {
+        } else {
-                        brelse(bh_frozen); /* already frozen */
+                brelse(bh_frozen); /* already frozen */
-                }
-                ret = 0;
        }
        unlock_page(page);
        page_cache_release(page);
-        return ret;
+        return 0;
 }
 struct buffer_head *
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf16..98034271cd0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
        .rename         = nilfs_rename,
        .setattr        = nilfs_setattr,
        .permission     = nilfs_permission,
+        .fiemap         = nilfs_fiemap,
 };
 const struct inode_operations nilfs_special_inode_operations = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f7560da5a56..777e8fd0430 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
        return nilfs_test_transaction_flag(NILFS_TI_WRITER);
 }
-static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
-{
-        return nilfs->ns_dat;
-}
 /*
 * function prototype
 */
@@ -256,14 +251,14 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
-int nilfs_permission(struct inode *inode, int mask);
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
-extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
+int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
-                                  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
-extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
+int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
-                                unsigned);
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
+int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                 __u64 start, __u64 len);
 /* super.c */
 extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f..0c432416cfe 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
        }
        return nc;
 }
- 
 void nilfs_mapping_init_once(struct address_space *mapping)
 {
        memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
        }
        return TestClearPageDirty(page);
 }
+/**
+ * nilfs_find_uncommitted_extent - find extent of uncommitted data
+ * @inode: inode whose page cache (inode->i_mapping) is scanned
+ * @start_blk: start block offset (in)
+ * @blkoff: start offset of the found extent (out); written only when a
+ *          delayed buffer is found, so it is left untouched when zero
+ *          is returned
+ *
+ * This function searches an extent of buffers marked "delayed" which
+ * starts from a block offset equal to or larger than @start_blk.  If
+ * such an extent was found, this will store the start offset in
+ * @blkoff and return its length in blocks.  Otherwise, zero is
+ * returned.
+ *
+ * Pages are looked up in batches of up to PAGEVEC_SIZE and each page is
+ * locked while its buffers are examined.  Once an extent has been
+ * started it ends at the first buffer that is not delayed, at a page
+ * that has no buffers, or when the next lookup returns no pages or a
+ * first page whose index is beyond the expected one.
+ */
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+                                            sector_t start_blk,
+                                            sector_t *blkoff)
+{
+        unsigned int i;
+        pgoff_t index;
+        unsigned int nblocks_in_page;
+        unsigned long length = 0;
+        sector_t b;
+        struct pagevec pvec;
+        struct page *page;
+        if (inode->i_mapping->nrpages == 0)
+                return 0; /* no cached pages, so nothing to find */
+        index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        pagevec_init(&pvec, 0);
+repeat:
+        pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
+                                        pvec.pages);
+        if (pvec.nr == 0)
+                return length; /* lookup came back empty; pvec holds no pages */
+        if (length > 0 && pvec.pages[0]->index > index)
+                goto out; /* extent in progress, but next page is not the expected one */
+        b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        i = 0;
+        do {
+                page = pvec.pages[i];
+                lock_page(page);
+                if (page_has_buffers(page)) {
+                        struct buffer_head *bh, *head;
+                        bh = head = page_buffers(page);
+                        do {
+                                if (b < start_blk)
+                                        continue; /* do-while continue: ++b and bh advance still run */
+                                if (buffer_delay(bh)) {
+                                        if (length == 0)
+                                                *blkoff = b; /* first delayed buffer starts the extent */
+                                        length++;
+                                } else if (length > 0) {
+                                        goto out_locked; /* extent ended; page is still locked */
+                                }
+                        } while (++b, bh = bh->b_this_page, bh != head);
+                } else {
+                        if (length > 0)
+                                goto out_locked;
+                        b += nblocks_in_page; /* no buffers: step over this page's blocks */
+                }
+                unlock_page(page);
+        } while (++i < pagevec_count(&pvec));
+        index = page->index + 1; /* resume right after the last page of this batch */
+        pagevec_release(&pvec);
+        cond_resched();
+        goto repeat;
+out_locked:
+        unlock_page(page);
+out:
+        pagevec_release(&pvec);
+        return length;
+}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a203..622df27cd89 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
                        struct backing_dev_info *bdi,
                        const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
+                                            sector_t start_blk,
+                                            sector_t *blkoff);
 #define NILFS_PAGE_BUG(page, m, a...) \
        do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da..3dfcd3b7d38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
                if (unlikely(err))
                        goto failed_page;
-                err = nilfs_set_file_dirty(sbi, inode, 1);
+                err = nilfs_set_file_dirty(inode, 1);
                if (unlikely(err))
                        goto failed_page;
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b98..7a17715f215 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
 #include <linux/types.h>
 #include <linux/fs.h>
-/*
- * Mount options
- */
-struct nilfs_mount_options {
-        unsigned long mount_opt;
-        __u64 snapshot_cno;
-};
 struct the_nilfs;
 struct nilfs_sc_info;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea3..55ebae5c7f3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
        return err;
 }
-static int nilfs_handle_bmap_error(int err, const char *fname,
-                                   struct inode *inode, struct super_block *sb)
-{
-        if (err == -EINVAL) {
-                nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
-                            inode->i_ino);
-                err = -EIO;
-        }
-        return err;
-}
 /*
 * Callback functions that enumerate, mark, and collect dirty blocks
 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
        int err;
        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
+        if (err < 0)
-                return nilfs_handle_bmap_error(err, __func__, inode,
+                return err;
-                                               sci->sc_super);
        err = nilfs_segctor_add_file_block(sci, bh, inode,
                                           sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
                                   struct buffer_head *bh,
                                   struct inode *inode)
 {
-        int err;
+        return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
-                return nilfs_handle_bmap_error(err, __func__, inode,
-                                               sci->sc_super);
-        return 0;
 }
 static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
        int err;
        err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
-        if (unlikely(err < 0))
+        if (err < 0)
-                return nilfs_handle_bmap_error(err, __func__, inode,
+                return err;
-                                               sci->sc_super);
        err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
        if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
                ret++;
        if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
                ret++;
-        if (ret || nilfs_doing_gc())
+        if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
-                if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
+                ret++;
-                        ret++;
        return ret;
 }
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
        nilfs_mdt_clear_dirty(sci->sc_root->ifile);
        nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
        nilfs_mdt_clear_dirty(nilfs->ns_sufile);
-        nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
+        nilfs_mdt_clear_dirty(nilfs->ns_dat);
 }
 static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
                              nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
        raw_sr->sr_flags = 0;
-        nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
+        nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
                                 NILFS_SR_DAT_OFFSET(isz), 1);
        nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
                                 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                sci->sc_stage.scnt++;  /* Fall through */
        case NILFS_ST_DAT:
 dat_stage:
-                err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
+                err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
                                              &nilfs_sc_dat_ops);
                if (unlikely(err))
                        break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
        return 0;
 failed_bmap:
-        err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
        return err;
 }
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
                                if (!err) {
                                        set_buffer_uptodate(bh);
                                        clear_buffer_dirty(bh);
+                                        clear_buffer_delay(bh);
                                        clear_buffer_nilfs_volatile(bh);
                                }
                                brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
                                    b_assoc_buffers) {
                        set_buffer_uptodate(bh);
                        clear_buffer_dirty(bh);
+                        clear_buffer_delay(bh);
                        clear_buffer_nilfs_volatile(bh);
                        clear_buffer_nilfs_redirected(bh);
                        if (bh == segbuf->sb_super_root) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d..58fd707174e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
 #include <linux/crc32.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
-#include <linux/kobject.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
                 const char *fmt, ...)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
-        vprintk(fmt, args);
+        vaf.fmt = fmt;
-        printk("\n");
+        vaf.va = &args;
+        printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
        if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
 void nilfs_warning(struct super_block *sb, const char *function,
                   const char *fmt, ...)
 {
+        struct va_format vaf;
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "NILFS warning (device %s): %s: ",
-               sb->s_id, function);
+        vaf.fmt = fmt;
-        vprintk(fmt, args);
+        vaf.va = &args;
-        printk("\n");
+        printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
+               sb->s_id, function, &vaf);
        va_end(args);
 }
@@ -162,10 +170,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
        return &ii->vfs_inode;
 }
-void nilfs_destroy_inode(struct inode *inode)
+static void nilfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
        if (mdi) {
                kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
                kfree(mdi);
@@ -173,6 +184,11 @@ void nilfs_destroy_inode(struct inode *inode)
        kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
+void nilfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, nilfs_i_callback); /* teardown is done by nilfs_i_callback; presumably deferred past an RCU grace period - confirm */
+}
 static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
@@ -688,7 +704,8 @@ skip_mount_setup:
        sbp[0]->s_state =
                cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
        /* synchronize sbp[1] with sbp[0] */
-        memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+        if (sbp[1])
+                memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
        return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
@@ -838,7 +855,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-        return atomic_read(&root_dentry->d_count) > 1;
+        return root_dentry->d_count > 1;
 }
 /**
@@ -1002,11 +1019,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
        unsigned long old_sb_flags;
-        struct nilfs_mount_options old_opts;
+        unsigned long old_mount_opt;
        int err;
        old_sb_flags = sb->s_flags;
-        old_opts.mount_opt = sbi->s_mount_opt;
+        old_mount_opt = sbi->s_mount_opt;
        if (!parse_options(data, sb, 1)) {
                err = -EINVAL;
@@ -1075,7 +1092,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 restore_opts:
        sb->s_flags = old_sb_flags;
-        sbi->s_mount_opt = old_opts.mount_opt;
+        sbi->s_mount_opt = old_mount_opt;
        return err;
 }
@@ -1147,14 +1164,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 {
        struct nilfs_super_data sd;
        struct super_block *s;
-        fmode_t mode = FMODE_READ;
+        fmode_t mode = FMODE_READ | FMODE_EXCL;
        struct dentry *root_dentry;
        int err, s_new = false;
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
-        sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+        sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        if (IS_ERR(sd.bdev))
                return ERR_CAST(sd.bdev);
@@ -1233,7 +1250,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
        }
        if (!s_new)
-                close_bdev_exclusive(sd.bdev, mode);
+                blkdev_put(sd.bdev, mode);
        return root_dentry;
@@ -1242,7 +1259,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 failed:
        if (!s_new)
-                close_bdev_exclusive(sd.bdev, mode);
+                blkdev_put(sd.bdev, mode);
        return ERR_PTR(err);
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c..ad4ac607cf5 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        printk(KERN_INFO "NILFS: recovery complete.\n");
 skip_recovery:
-        set_nilfs_loaded(nilfs);
        nilfs_clear_recovery_info(&ri);
        sbi->s_super->s_flags = s_flags;
        return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
-        struct inode *dat = nilfs_dat_inode(nilfs);
        unsigned long ncleansegs;
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
        return 0;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b74..fd85e4c05c6 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
 /* the_nilfs struct */
 enum {
        THE_NILFS_INIT = 0,     /* Information from super_block is set */
-        THE_NILFS_LOADED,       /* Roll-back/roll-forward has done and
-                                   the latest checkpoint was loaded */
        THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
        THE_NILFS_GC_RUNNING,   /* gc process is running */
        THE_NILFS_SB_DIRTY,     /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs)			\
 }
 THE_NILFS_FNS(INIT, init)
-THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b..7dceff005a6 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
        ---help---
           Say Y here to enable fanotify suport.  fanotify is a file access
           notification system which differs from inotify in that it sends
-           and open file descriptor to the userspace listener along with
+           an open file descriptor to the userspace listener along with
           the event.
           If unsure, say Y.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09..f35794b97e8 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-        wait_event(group->fanotify_data.access_waitq, event->response);
+        wait_event(group->fanotify_data.access_waitq, event->response ||
+                                atomic_read(&group->fanotify_data.bypass_perm));
+        if (!event->response) /* bypass_perm set */
+                return 0;
        /* userspace responded, convert to something usable */
        spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7..8b61220cffc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
        return client_fd;
 }
-static ssize_t fill_event_metadata(struct fsnotify_group *group,
+static int fill_event_metadata(struct fsnotify_group *group,
                                   struct fanotify_event_metadata *metadata,
                                   struct fsnotify_event *event)
 {
+        int ret = 0;
        pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
                 group, metadata, event);
        metadata->event_len = FAN_EVENT_METADATA_LEN;
+        metadata->metadata_len = FAN_EVENT_METADATA_LEN;
        metadata->vers = FANOTIFY_METADATA_VERSION;
        metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
        metadata->pid = pid_vnr(event->tgid);
-        metadata->fd = create_fd(group, event);
+        if (unlikely(event->mask & FAN_Q_OVERFLOW))
+                metadata->fd = FAN_NOFD;
+        else {
+                metadata->fd = create_fd(group, event);
+                if (metadata->fd < 0)
+                        ret = metadata->fd;
+        }
-        return metadata->fd;
+        return ret;
 }
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
        mutex_lock(&group->fanotify_data.access_mutex);
-        if (group->fanotify_data.bypass_perm) {
+        if (atomic_read(&group->fanotify_data.bypass_perm)) {
                mutex_unlock(&group->fanotify_data.access_mutex);
                kmem_cache_free(fanotify_response_event_cache, re);
                event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-        fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+        ret = fill_event_metadata(group, &fanotify_event_metadata, event);
-        if (fd < 0)
+        if (ret < 0)
-                return fd;
+                goto out;
+        fd = fanotify_event_metadata.fd;
        ret = prepare_for_access_response(group, event, fd);
        if (ret)
                goto out_close_fd;
        ret = -EFAULT;
-        if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+        if (copy_to_user(buf, &fanotify_event_metadata,
+                         fanotify_event_metadata.event_len))
                goto out_kill_access_response;
-        return FAN_EVENT_METADATA_LEN;
+        return fanotify_event_metadata.event_len;
 out_kill_access_response:
        remove_access_response(group, event, fd);
 out_close_fd:
-        sys_close(fd);
+        if (fd != FAN_NOFD)
+                sys_close(fd);
+out:
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+        if (event->mask & FAN_ALL_PERM_EVENTS) {
+                event->response = FAN_DENY;
+                wake_up(&group->fanotify_data.access_waitq);
+        }
+#endif
        return ret;
 }
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
        mutex_lock(&group->fanotify_data.access_mutex);
-        group->fanotify_data.bypass_perm = true;
+        atomic_inc(&group->fanotify_data.bypass_perm);
        list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
                pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
+        int ret = 0;
        fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
        if (!fsn_mark) {
-                int ret;
                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
                        return -ENOSPC;
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
                fsnotify_init_mark(fsn_mark, fanotify_free_mark);
                ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-                if (ret) {
+                if (ret)
-                        fanotify_free_mark(fsn_mark);
+                        goto err;
-                        return ret;
-                }
        }
        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-        fsnotify_put_mark(fsn_mark);
        if (added & ~mnt->mnt_fsnotify_mask)
                fsnotify_recalc_vfsmount_mask(mnt);
+err:
-        return 0;
+        fsnotify_put_mark(fsn_mark);
+        return ret;
 }
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
+        int ret = 0;
        pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
        fsn_mark = fsnotify_find_inode_mark(group, inode);
        if (!fsn_mark) {
-                int ret;
                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
                        return -ENOSPC;
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
                fsnotify_init_mark(fsn_mark, fanotify_free_mark);
                ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-                if (ret) {
+                if (ret)
-                        fanotify_free_mark(fsn_mark);
+                        goto err;
-                        return ret;
-                }
        }
        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-        fsnotify_put_mark(fsn_mark);
        if (added & ~inode->i_fsnotify_mask)
                fsnotify_recalc_inode_mask(inode);
-        return 0;
+err:
+        fsnotify_put_mark(fsn_mark);
+        return ret;
 }
 /* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
        group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
-        if (IS_ERR(group))
+        if (IS_ERR(group)) {
+                free_uid(user);
                return PTR_ERR(group);
+        }
        group->fanotify_data.user = user;
        atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        mutex_init(&group->fanotify_data.access_mutex);
        init_waitqueue_head(&group->fanotify_data.access_waitq);
        INIT_LIST_HEAD(&group->fanotify_data.access_list);
+        atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
        switch (flags & FAN_ALL_CLASS_BITS) {
        case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
        if (flags & ~FAN_ALL_MARK_FLAGS)
                return -EINVAL;
        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
-        case FAN_MARK_ADD:
+        case FAN_MARK_ADD:              /* fallthrough */
        case FAN_MARK_REMOVE:
+                if (!mask)
+                        return -EINVAL;
        case FAN_MARK_FLUSH:
                break;
        default:
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 20dc218707c..79b47cbb5cd 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
        /* determine if the children should tell inode about their events */
        watched = fsnotify_inode_watches_children(inode);
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        /* run all of the dentries associated with this inode.  Since this is a
         * directory, there damn well better only be one item on this list */
        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,19 +68,21 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
                /* run all of the children of the original inode and fix their
                 * d_flags to indicate parental interest (their parent is the
                 * original inode) */
+                spin_lock(&alias->d_lock);
                list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
                        if (!child->d_inode)
                                continue;
-                        spin_lock(&child->d_lock);
+                        spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                        if (watched)
                                child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                        else
                                child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                        spin_unlock(&child->d_lock);
                }
+                spin_unlock(&alias->d_lock);
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
 }
 /* Notify this dentry's parent about a child's events. */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468..4cd5d5d78f9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
        if (ret >= 0)
                return ret;
+        fsnotify_put_group(group);
        atomic_dec(&user->inotify_devs);
 out_free_uid:
        free_uid(user);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be99254..4ff028fcfd6 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
             index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
             unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a..f4b1057abdd 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
 * single-segment behaviour.
 *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * when atomic and when not atomic.  This is ok because
+ * atomic and when not atomic.  This is ok because it calls
- * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * and it is ok to call this when non-atomic.
+ * fact, the only difference between __copy_from_user_inatomic() and
- * Infact, the only difference between __copy_from_user_inatomic() and
 * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error.  And on many
+ * should not zero the tail of the buffer on error.  And on many architectures
- * architectures __copy_from_user_inatomic() is just defined to
+ * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * __copy_from_user() so it makes no difference at all on those architectures.
+ * makes no difference at all on those architectures.
 */
 static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
                unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
                if (unlikely(copied != len)) {
                        /* Do it the slow way. */
                        addr = kmap(*pages);
-                        copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
+                        copied = __ntfs_copy_from_user_iovec_inatomic(addr +
-                                        *iov, *iov_ofs, len);
+                                        ofs, *iov, *iov_ofs, len);
-                        /*
-                         * Zero the rest of the target like __copy_from_user().
-                         */
-                        memset(addr + ofs + copied, 0, len - copied);
-                        kunmap(*pages);
                        if (unlikely(copied != len))
                                goto err_out;
+                        kunmap(*pages);
                }
                total += len;
+                ntfs_set_next_iovec(iov, iov_ofs, len);
                bytes -= len;
                if (!bytes)
                        break;
-                ntfs_set_next_iovec(iov, iov_ofs, len);
                ofs = 0;
        } while (++pages < last_page);
 out:
        return total;
 err_out:
-        total += copied;
+        BUG_ON(copied > len);
        /* Zero the rest of the target like __copy_from_user(). */
+        memset(addr + ofs + copied, 0, len - copied);
+        kunmap(*pages);
+        total += copied;
+        ntfs_set_next_iovec(iov, iov_ofs, copied);
        while (++pages < last_page) {
                bytes -= len;
                if (!bytes)
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc..a627ed82c0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
        return NULL;
 }
+static void ntfs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
 void ntfs_destroy_big_inode(struct inode *inode)
 {
        ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
        BUG_ON(ni->page);
        if (!atomic_dec_and_test(&ni->count))
                BUG();
-        kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+        call_rcu(&inode->i_rcu, ntfs_i_callback);
 }
 static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b672718..326e7475a22 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
        flush_dcache_page(page);
        SetPageUptodate(page);
        if (base_ni) {
+                MFT_RECORD *m_tmp;
                /*
                 * Setup the base mft record in the extent mft record.  This
                 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
                 * attach it to the base inode @base_ni and map, pin, and lock
                 * its, i.e. the allocated, mft record.
                 */
-                m = map_extent_mft_record(base_ni, bit, &ni);
+                m_tmp = map_extent_mft_record(base_ni, bit, &ni);
-                if (IS_ERR(m)) {
+                if (IS_ERR(m_tmp)) {
                        ntfs_error(vol->sb, "Failed to map allocated extent "
                                        "mft record 0x%llx.", (long long)bit);
-                        err = PTR_ERR(m);
+                        err = PTR_ERR(m_tmp);
                        /* Set the mft record itself not in use. */
                        m->flags &= cpu_to_le16(
                                        ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
                        ntfs_unmap_page(page);
                        goto undo_mftbmp_alloc;
                }
+                BUG_ON(m != m_tmp);
                /*
                 * Make sure the allocated mft record is written out to disk.
                 * No need to set the inode dirty because the caller is going
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f..29099a07b9f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
 /*
 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
 *
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2001,2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
        ntfs_sysctl(0);
 }
-MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698..77a8de5f711 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
 config OCFS2_FS
        tristate "OCFS2 file system support"
-        depends on NET && SYSFS
+        depends on NET && SYSFS && CONFIGFS_FS
-        select CONFIGFS_FS
        select JBD2
        select CRC32
        select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
 config OCFS2_FS_STATS
        bool "OCFS2 statistics"
-        depends on OCFS2_FS
+        depends on OCFS2_FS && DEBUG_FS
        default y
        help
          This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe..704f6b1742f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
        return ret;
 }
-int ocfs2_check_acl(struct inode *inode, int mask)
+int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb;
        struct buffer_head *di_bh = NULL;
        struct posix_acl *acl;
        int ret = -EAGAIN;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        osb = OCFS2_SB(inode->i_sb);
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return ret;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f0585..4fe7c9cf4bf 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
        __le32 e_id;
 };
-extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_check_acl(struct inode *, int, unsigned int);
 extern int ocfs2_acl_chmod(struct inode *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
                          struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d..e4984e259cb 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
        return ret;
 }
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
                                         struct ocfs2_extent_block *eb);
 static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        ocfs2_journal_dirty(handle, tl_bh);
+        osb->truncated_clusters += num_clusters;
 bail:
        mlog_exit(status);
        return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                i--;
        }
+        osb->truncated_clusters = 0;
 bail:
        mlog_exit(status);
        return status;
@@ -7139,64 +7141,6 @@ bail:
 }
 /*
- * Expects the inode to already be locked.
- */
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
-                           struct inode *inode,
-                           struct buffer_head *fe_bh,
-                           struct ocfs2_truncate_context **tc)
-{
-        int status;
-        unsigned int new_i_clusters;
-        struct ocfs2_dinode *fe;
-        struct ocfs2_extent_block *eb;
-        struct buffer_head *last_eb_bh = NULL;
-        mlog_entry_void();
-        *tc = NULL;
-        new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
-                                                  i_size_read(inode));
-        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
-             "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
-             (unsigned long long)le64_to_cpu(fe->i_size));
-        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
-        if (!(*tc)) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
-        if (fe->id2.i_list.l_tree_depth) {
-                status = ocfs2_read_extent_block(INODE_CACHE(inode),
-                                                 le64_to_cpu(fe->i_last_eb_blk),
-                                                 &last_eb_bh);
-                if (status < 0) {
-                        mlog_errno(status);
-                        goto bail;
-                }
-                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-        }
-        (*tc)->tc_last_eb_bh = last_eb_bh;
-        status = 0;
-bail:
-        if (status < 0) {
-                if (*tc)
-                        ocfs2_free_truncate_context(*tc);
-                *tc = NULL;
-        }
-        mlog_exit_void();
-        return status;
-}
-/*
 * 'start' is inclusive, 'end' is not.
 */
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
 out:
        return ret;
 }
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
-{
-        /*
-         * The caller is responsible for completing deallocation
-         * before freeing the context.
-         */
-        if (tc->tc_dealloc.c_first_suballocator != NULL)
-                mlog(ML_NOTICE,
-                     "Truncate completion has non-empty dealloc context\n");
-        brelse(tc->tc_last_eb_bh);
-        kfree(tc);
-}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b9..3bd08a03251 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
                                  u64 range_start, u64 range_end);
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
-                           struct inode *inode,
-                           struct buffer_head *fe_bh,
-                           struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct inode *inode,
                          struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..1fbb0e20131 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+        if (ocfs2_iocb_is_sem_locked(iocb)) {
+                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        ocfs2_iocb_clear_rw_locked(iocb);
        level = ocfs2_iocb_rw_locked_level(iocb);
-        if (!level)
-                up_read(&inode->i_alloc_sem);
        ocfs2_rw_unlock(inode, level);
        if (is_async)
@@ -1627,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
        return ret;
 }
+/*
+ * Try to flush the truncate log if that can free enough clusters.
+ * Return value: "< 0" means error, "0" means the caller should not retry,
+ * and "1" means space was freed and the caller should try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+                                          unsigned int needed)
+{
+        tid_t target;
+        int ret = 0;
+        unsigned int truncated_clusters;
+        mutex_lock(&osb->osb_tl_inode->i_mutex);
+        truncated_clusters = osb->truncated_clusters;
+        mutex_unlock(&osb->osb_tl_inode->i_mutex);
+        /*
+         * Check whether we can succeed in allocating if we free
+         * the truncate log.
+         */
+        if (truncated_clusters < needed)
+                goto out;
+        ret = ocfs2_flush_truncate_log(osb);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+                jbd2_log_wait_commit(osb->journal->j_journal, target);
+                ret = 1;
+        }
+out:
+        return ret;
+}
 int ocfs2_write_begin_nolock(struct file *filp,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
@@ -1634,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                             struct buffer_head *di_bh, struct page *mmap_page)
 {
        int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
-        unsigned int clusters_to_alloc, extents_to_split;
+        unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
        struct ocfs2_write_ctxt *wc;
        struct inode *inode = mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1643,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
        struct ocfs2_alloc_context *meta_ac = NULL;
        handle_t *handle;
        struct ocfs2_extent_tree et;
+        int try_free = 1, ret1;
+try_again:
        ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
        if (ret) {
                mlog_errno(ret);
@@ -1678,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                mlog_errno(ret);
                goto out;
        } else if (ret == 1) {
+                clusters_need = wc->w_clen;
                ret = ocfs2_refcount_cow(inode, filp, di_bh,
                                         wc->w_cpos, wc->w_clen, UINT_MAX);
                if (ret) {
@@ -1692,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
                mlog_errno(ret);
                goto out;
        }
+        clusters_need += clusters_to_alloc;
        di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
@@ -1814,6 +1858,22 @@ out:
                ocfs2_free_alloc_context(data_ac);
        if (meta_ac)
                ocfs2_free_alloc_context(meta_ac);
+        if (ret == -ENOSPC && try_free) {
+                /*
+                 * Try to free some truncate log so that we can have enough
+                 * clusters to allocate.
+                 */
+                try_free = 0;
+                ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+                if (ret1 == 1)
+                        goto try_again;
+                if (ret1 < 0)
+                        mlog_errno(ret1);
+        }
        return ret;
 }
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691..eceb456037c 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
        else
                clear_bit(1, (unsigned long *)&iocb->private);
 }
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+        OCFS2_IOCB_RW_LOCK = 0,
+        OCFS2_IOCB_RW_LOCK_LEVEL,
+        OCFS2_IOCB_SEM,
+        OCFS2_IOCB_NUM_LOCKS
+};
 #define ocfs2_iocb_clear_rw_locked(iocb) \
-        clear_bit(0, (unsigned long *)&iocb->private)
+        clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_rw_locked_level(iocb) \
-        test_bit(1, (unsigned long *)&iocb->private)
+        test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+        set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+        clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+        test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e2..b108e863d8f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 #define O2HB_DB_TYPE_REGION_LIVENODES   4
 #define O2HB_DB_TYPE_REGION_NUMBER      5
 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME        6
+#define O2HB_DB_TYPE_REGION_PINNED      7
 struct o2hb_debug_buf {
        int db_type;
        int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
 #define O2HB_DEBUG_FAILEDREGIONS        "failed_regions"
 #define O2HB_DEBUG_REGION_NUMBER        "num"
 #define O2HB_DEBUG_REGION_ELAPSED_TIME  "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED        "pinned"
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
 unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF                3
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
 /* Only sets a new threshold if there are no active regions.
 *
 * No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
        struct config_item      hr_item;
        struct list_head        hr_all_item;
-        unsigned                hr_unclean_stop:1;
+        unsigned                hr_unclean_stop:1,
+                                hr_item_pinned:1,
+                                hr_item_dropped:1;
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
        struct dentry           *hr_debug_livenodes;
        struct dentry           *hr_debug_regnum;
        struct dentry           *hr_debug_elapsed_time;
+        struct dentry           *hr_debug_pinned;
        struct o2hb_debug_buf   *hr_db_livenodes;
        struct o2hb_debug_buf   *hr_db_regnum;
        struct o2hb_debug_buf   *hr_db_elapsed_time;
+        struct o2hb_debug_buf   *hr_db_pinned;
        /* let the person setting up hb wait for it to return until it
         * has reached a 'steady' state.  This will be fixed when we have
@@ -307,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
 {
-        cancel_delayed_work(&reg->hr_write_timeout_work);
+        cancel_delayed_work_sync(&reg->hr_write_timeout_work);
-        flush_scheduled_work();
 }
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -702,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
               config_item_name(&reg->hr_item));
        set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+        /*
+         * If global heartbeat is active, unpin all regions if the
+         * quorum region count > O2HB_PIN_CUT_OFF
+         */
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+                o2hb_region_unpin(NULL);
 }
 static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1042,6 +1082,9 @@ static int o2hb_thread(void *data)
        set_user_nice(current, -20);
+        /* Pin node */
+        o2nm_depend_this_node();
        while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                /* We track the time spent inside
                 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1091,6 +1134,9 @@ static int o2hb_thread(void *data)
                mlog_errno(ret);
        }
+        /* Unpin node */
+        o2nm_undepend_this_node();
        mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
        return 0;
@@ -1143,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
                                                 reg->hr_last_timeout_start));
                goto done;
+        case O2HB_DB_TYPE_REGION_PINNED:
+                reg = (struct o2hb_region *)db->db_data;
+                out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+                                !!reg->hr_item_pinned);
+                goto done;
        default:
                goto done;
        }
@@ -1316,6 +1368,8 @@ int o2hb_init(void)
        memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
        memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+        o2hb_dependent_users = 0;
        return o2hb_debug_init();
 }
@@ -1385,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
        debugfs_remove(reg->hr_debug_livenodes);
        debugfs_remove(reg->hr_debug_regnum);
        debugfs_remove(reg->hr_debug_elapsed_time);
+        debugfs_remove(reg->hr_debug_pinned);
        debugfs_remove(reg->hr_debug_dir);
        spin_lock(&o2hb_live_lock);
@@ -1674,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
                goto out;
        reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-        ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+        ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
        if (ret) {
                reg->hr_bdev = NULL;
                goto out;
@@ -1949,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
                goto bail;
        }
+        reg->hr_debug_pinned =
+                        o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+                                          reg->hr_debug_dir,
+                                          &(reg->hr_db_pinned),
+                                          sizeof(*(reg->hr_db_pinned)),
+                                          O2HB_DB_TYPE_REGION_PINNED,
+                                          0, 0, reg);
+        if (!reg->hr_debug_pinned) {
+                mlog_errno(ret);
+                goto bail;
+        }
        ret = 0;
 bail:
        return ret;
@@ -1964,8 +2031,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
        if (reg == NULL)
                return ERR_PTR(-ENOMEM);
-        if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+        if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
-                return ERR_PTR(-ENAMETOOLONG);
+                ret = -ENAMETOOLONG;
+                goto free;
+        }
        spin_lock(&o2hb_live_lock);
        reg->hr_region_num = 0;
@@ -1974,7 +2043,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
                                                         O2NM_MAX_REGIONS);
                if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
                        spin_unlock(&o2hb_live_lock);
-                        return ERR_PTR(-EFBIG);
+                        ret = -EFBIG;
+                        goto free;
                }
                set_bit(reg->hr_region_num, o2hb_region_bitmap);
        }
@@ -1986,10 +2056,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
        ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
        if (ret) {
                config_item_put(&reg->hr_item);
-                return ERR_PTR(ret);
+                goto free;
        }
        return &reg->hr_item;
+free:
+        kfree(reg);
+        return ERR_PTR(ret);
 }
 static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1997,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 {
        struct task_struct *hb_task;
        struct o2hb_region *reg = to_o2hb_region(item);
+        int quorum_region = 0;
        /* stop the thread when the user removes the region dir */
        spin_lock(&o2hb_live_lock);
        if (o2hb_global_heartbeat_active()) {
                clear_bit(reg->hr_region_num, o2hb_region_bitmap);
                clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+                if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+                        quorum_region = 1;
+                clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
        }
        hb_task = reg->hr_task;
        reg->hr_task = NULL;
+        reg->hr_item_dropped = 1;
        spin_unlock(&o2hb_live_lock);
        if (hb_task)
@@ -2023,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
        if (o2hb_global_heartbeat_active())
                printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
                       config_item_name(&reg->hr_item));
        config_item_put(item);
+        if (!o2hb_global_heartbeat_active() || !quorum_region)
+                return;
+        /*
+         * If global heartbeat is active and there are dependent users,
+         * pin all regions if the quorum region count <= O2HB_PIN_CUT_OFF
+         */
+        spin_lock(&o2hb_live_lock);
+        if (!o2hb_dependent_users)
+                goto unlock;
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+                o2hb_region_pin(NULL);
+unlock:
+        spin_unlock(&o2hb_live_lock);
 }
 struct o2hb_heartbeat_group_attribute {
@@ -2209,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 }
 EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
 {
-        struct o2hb_region *p, *reg = NULL;
+        int ret = 0, found = 0;
+        struct o2hb_region *reg;
+        char *uuid;
        assert_spin_locked(&o2hb_live_lock);
-        list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
+        list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
-                if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
+                uuid = config_item_name(&reg->hr_item);
-                        reg = p;
-                        break;
+                /* local heartbeat */
+                if (region_uuid) {
+                        if (strcmp(region_uuid, uuid))
+                                continue;
+                        found = 1;
                }
+                if (reg->hr_item_pinned || reg->hr_item_dropped)
+                        goto skip_pin;
+                /* Ignore ENOENT only for local hb (userdlm domain) */
+                ret = o2nm_depend_item(&reg->hr_item);
+                if (!ret) {
+                        mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+                        reg->hr_item_pinned = 1;
+                } else {
+                        if (ret == -ENOENT && found)
+                                ret = 0;
+                        else {
+                                mlog(ML_ERROR, "Pin region %s fails with %d\n",
+                                     uuid, ret);
+                                break;
+                        }
+                }
+skip_pin:
+                if (found)
+                        break;
        }
-        return reg;
+        return ret;
 }
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
 {
-        int ret = 0;
        struct o2hb_region *reg;
+        char *uuid;
+        int found = 0;
-        spin_lock(&o2hb_live_lock);
+        assert_spin_locked(&o2hb_live_lock);
-        reg = o2hb_find_region(region_uuid);
+        list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
-        if (!reg)
+                uuid = config_item_name(&reg->hr_item);
-                ret = -ENOENT;
+                if (region_uuid) {
-        spin_unlock(&o2hb_live_lock);
+                        if (strcmp(region_uuid, uuid))
+                                continue;
+                        found = 1;
+                }
-        if (ret)
+                if (reg->hr_item_pinned) {
-                goto out;
+                        mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+                        o2nm_undepend_item(&reg->hr_item);
+                        reg->hr_item_pinned = 0;
+                }
+                if (found)
+                        break;
+        }
+}
-        ret = o2nm_depend_this_node();
+static int o2hb_region_inc_user(const char *region_uuid)
-        if (ret)
+{
-                goto out;
+        int ret = 0;
-        ret = o2nm_depend_item(&reg->hr_item);
+        spin_lock(&o2hb_live_lock);
-        if (ret)
-                o2nm_undepend_this_node();
-out:
+        /* local heartbeat */
+        if (!o2hb_global_heartbeat_active()) {
+            ret = o2hb_region_pin(region_uuid);
+            goto unlock;
+        }
+        /*
+         * if global heartbeat active and this is the first dependent user,
+         * pin all regions if quorum region count <= CUT_OFF
+         */
+        o2hb_dependent_users++;
+        if (o2hb_dependent_users > 1)
+                goto unlock;
+        if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+                           O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+                ret = o2hb_region_pin(NULL);
+unlock:
+        spin_unlock(&o2hb_live_lock);
        return ret;
 }
-static void o2hb_region_put(const char *region_uuid)
+void o2hb_region_dec_user(const char *region_uuid)
 {
-        struct o2hb_region *reg;
        spin_lock(&o2hb_live_lock);
-        reg = o2hb_find_region(region_uuid);
+        /* local heartbeat */
+        if (!o2hb_global_heartbeat_active()) {
+            o2hb_region_unpin(region_uuid);
+            goto unlock;
+        }
-        spin_unlock(&o2hb_live_lock);
+        /*
+         * if global heartbeat active and there are no dependent users,
+         * unpin all quorum regions
+         */
+        o2hb_dependent_users--;
+        if (!o2hb_dependent_users)
+                o2hb_region_unpin(NULL);
-        if (reg) {
+unlock:
-                o2nm_undepend_item(&reg->hr_item);
+        spin_unlock(&o2hb_live_lock);
-                o2nm_undepend_this_node();
-        }
 }
 int o2hb_register_callback(const char *region_uuid,
@@ -2286,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
        }
        if (region_uuid) {
-                ret = o2hb_region_get(region_uuid);
+                ret = o2hb_region_inc_user(region_uuid);
-                if (ret)
+                if (ret) {
+                        mlog_errno(ret);
                        goto out;
+                }
        }
        down_write(&o2hb_callback_sem);
@@ -2306,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
        up_write(&o2hb_callback_sem);
        ret = 0;
 out:
-        mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+        mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
             ret, __builtin_return_address(0), hc);
        return ret;
 }
@@ -2317,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
 {
        BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
-        mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+        mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
             __builtin_return_address(0), hc);
        /* XXX Can this happen _with_ a region reference? */
@@ -2325,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
                return;
        if (region_uuid)
-                o2hb_region_put(region_uuid);
+                o2hb_region_dec_user(region_uuid);
        down_write(&o2hb_callback_sem);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392..6c61771469a 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUOTA),
        define_mask(REFCOUNT),
        define_mask(BASTS),
+        define_mask(RESERVATIONS),
+        define_mask(CLUSTER),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
-        define_mask(RESERVATIONS),
 };
 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c9..34d6544357d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
 #include <linux/sched.h>
 /* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
 #define ML_ENTRY        0x0000000000000001ULL /* func call entry */
 #define ML_EXIT         0x0000000000000002ULL /* func call exit */
 #define ML_TCP          0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_REFCOUNT     0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS        0x0000001000000000ULL /* dlmglue asts and basts */
+#define ML_BASTS        0x0000000100000000ULL /* dlmglue asts and basts */
+#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER      0x0000000400000000ULL /* cluster stack */
 /* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_ERROR        0x1000000000000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE       0x0000000200000000ULL /* setn to KERN_NOTICE */
+#define ML_NOTICE       0x2000000000000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD      0x0000000400000000ULL /* kernel thread activity */
+#define ML_KTHREAD      0x4000000000000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
-#define ML_CLUSTER      0x0000001000000000ULL /* cluster stack */
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b0..3a5835904b3 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
 #define O2NET_DEBUG_DIR         "o2net"
 #define SC_DEBUG_NAME           "sock_containers"
 #define NST_DEBUG_NAME          "send_tracking"
+#define STATS_DEBUG_NAME        "stats"
+#define SHOW_SOCK_CONTAINERS    0
+#define SHOW_SOCK_STATS         1
 static struct dentry *o2net_dentry;
 static struct dentry *sc_dentry;
 static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
 static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static int nst_seq_show(struct seq_file *seq, void *v)
 {
        struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+        ktime_t now;
+        s64 sock, send, status;
        spin_lock(&o2net_debug_lock);
        nst = next_nst(dummy_nst);
+        if (!nst)
+                goto out;
-        if (nst != NULL) {
+        now = ktime_get();
-                /* get_task_comm isn't exported.  oh well. */
+        sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
-                seq_printf(seq, "%p:\n"
+        send = ktime_to_us(ktime_sub(now, nst->st_send_time));
-                           "  pid:          %lu\n"
+        status = ktime_to_us(ktime_sub(now, nst->st_status_time));
-                           "  tgid:         %lu\n"
-                           "  process name: %s\n"
+        /* get_task_comm isn't exported.  oh well. */
-                           "  node:         %u\n"
+        seq_printf(seq, "%p:\n"
-                           "  sc:           %p\n"
+                   "  pid:          %lu\n"
-                           "  message id:   %d\n"
+                   "  tgid:         %lu\n"
-                           "  message type: %u\n"
+                   "  process name: %s\n"
-                           "  message key:  0x%08x\n"
+                   "  node:         %u\n"
-                           "  sock acquiry: %lu.%ld\n"
+                   "  sc:           %p\n"
-                           "  send start:   %lu.%ld\n"
+                   "  message id:   %d\n"
-                           "  wait start:   %lu.%ld\n",
+                   "  message type: %u\n"
-                           nst, (unsigned long)nst->st_task->pid,
+                   "  message key:  0x%08x\n"
-                           (unsigned long)nst->st_task->tgid,
+                   "  sock acquiry: %lld usecs ago\n"
-                           nst->st_task->comm, nst->st_node,
+                   "  send start:   %lld usecs ago\n"
-                           nst->st_sc, nst->st_id, nst->st_msg_type,
+                   "  wait start:   %lld usecs ago\n",
-                           nst->st_msg_key,
+                   nst, (unsigned long)task_pid_nr(nst->st_task),
-                           nst->st_sock_time.tv_sec,
+                   (unsigned long)nst->st_task->tgid,
-                           (long)nst->st_sock_time.tv_usec,
+                   nst->st_task->comm, nst->st_node,
-                           nst->st_send_time.tv_sec,
+                   nst->st_sc, nst->st_id, nst->st_msg_type,
-                           (long)nst->st_send_time.tv_usec,
+                   nst->st_msg_key,
-                           nst->st_status_time.tv_sec,
+                   (long long)sock,
-                           (long)nst->st_status_time.tv_usec);
+                   (long long)send,
-        }
+                   (long long)status);
+out:
        spin_unlock(&o2net_debug_lock);
        return 0;
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
        spin_unlock(&o2net_debug_lock);
 }
+struct o2net_sock_debug {
+        int dbg_ctxt;
+        struct o2net_sock_container *dbg_sock;
+};
 static struct o2net_sock_container
                        *next_sc(struct o2net_sock_container *sc_start)
 {
@@ -253,7 +268,8 @@ static struct o2net_sock_container
 static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
 static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        return sc; /* unused, just needs to be null when done */
 }
-#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s)              ((_s)->sc_send_count)
+# define sc_recv_count(_s)              ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s)     (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s)        (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s)      (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s)     (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s)              (0U)
+# define sc_recv_count(_s)              (0U)
+# define sc_tv_acquiry_total_ns(_s)     (0LL)
+# define sc_tv_send_total_ns(_s)        (0LL)
+# define sc_tv_status_total_ns(_s)      (0LL)
+# define sc_tv_process_total_ns(_s)     (0LL)
+#endif
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION         1
+static void sc_show_sock_stats(struct seq_file *seq,
+                               struct o2net_sock_container *sc)
+{
+        if (!sc)
+                return;
+        seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+                   sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+                   (long long)sc_tv_acquiry_total_ns(sc),
+                   (long long)sc_tv_send_total_ns(sc),
+                   (long long)sc_tv_status_total_ns(sc),
+                   (unsigned long)sc_recv_count(sc),
+                   (long long)sc_tv_process_total_ns(sc));
+}
+static void sc_show_sock_container(struct seq_file *seq,
+                                   struct o2net_sock_container *sc)
+{
+        struct inet_sock *inet = NULL;
+        __be32 saddr = 0, daddr = 0;
+        __be16 sport = 0, dport = 0;
+        if (!sc)
+                return;
+        if (sc->sc_sock) {
+                inet = inet_sk(sc->sc_sock->sk);
+                /* the stack's structs aren't sparse endian clean */
+                saddr = (__force __be32)inet->inet_saddr;
+                daddr = (__force __be32)inet->inet_daddr;
+                sport = (__force __be16)inet->inet_sport;
+                dport = (__force __be16)inet->inet_dport;
+        }
+        /* XXX sigh, inet-> doesn't have sparse annotation so any
+         * use of it here generates a warning with -Wbitwise */
+        seq_printf(seq, "%p:\n"
+                   "  krefs:           %d\n"
+                   "  sock:            %pI4:%u -> "
+                                      "%pI4:%u\n"
+                   "  remote node:     %s\n"
+                   "  page off:        %zu\n"
+                   "  handshake ok:    %u\n"
+                   "  timer:           %lld usecs\n"
+                   "  data ready:      %lld usecs\n"
+                   "  advance start:   %lld usecs\n"
+                   "  advance stop:    %lld usecs\n"
+                   "  func start:      %lld usecs\n"
+                   "  func stop:       %lld usecs\n"
+                   "  func key:        0x%08x\n"
+                   "  func type:       %u\n",
+                   sc,
+                   atomic_read(&sc->sc_kref.refcount),
+                   &saddr, inet ? ntohs(sport) : 0,
+                   &daddr, inet ? ntohs(dport) : 0,
+                   sc->sc_node->nd_name,
+                   sc->sc_page_off,
+                   sc->sc_handshake_ok,
+                   (long long)ktime_to_us(sc->sc_tv_timer),
+                   (long long)ktime_to_us(sc->sc_tv_data_ready),
+                   (long long)ktime_to_us(sc->sc_tv_advance_start),
+                   (long long)ktime_to_us(sc->sc_tv_advance_stop),
+                   (long long)ktime_to_us(sc->sc_tv_func_start),
+                   (long long)ktime_to_us(sc->sc_tv_func_stop),
+                   sc->sc_msg_key,
+                   sc->sc_msg_type);
+}
 static int sc_seq_show(struct seq_file *seq, void *v)
 {
-        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
        spin_lock(&o2net_debug_lock);
        sc = next_sc(dummy_sc);
-        if (sc != NULL) {
+        if (sc) {
-                struct inet_sock *inet = NULL;
+                if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+                        sc_show_sock_container(seq, sc);
-                __be32 saddr = 0, daddr = 0;
+                else
-                __be16 sport = 0, dport = 0;
+                        sc_show_sock_stats(seq, sc);
-                if (sc->sc_sock) {
-                        inet = inet_sk(sc->sc_sock->sk);
-                        /* the stack's structs aren't sparse endian clean */
-                        saddr = (__force __be32)inet->inet_saddr;
-                        daddr = (__force __be32)inet->inet_daddr;
-                        sport = (__force __be16)inet->inet_sport;
-                        dport = (__force __be16)inet->inet_dport;
-                }
-                /* XXX sigh, inet-> doesn't have sparse annotation so any
-                 * use of it here generates a warning with -Wbitwise */
-                seq_printf(seq, "%p:\n"
-                           "  krefs:           %d\n"
-                           "  sock:            %pI4:%u -> "
-                                              "%pI4:%u\n"
-                           "  remote node:     %s\n"
-                           "  page off:        %zu\n"
-                           "  handshake ok:    %u\n"
-                           "  timer:           %lu.%ld\n"
-                           "  data ready:      %lu.%ld\n"
-                           "  advance start:   %lu.%ld\n"
-                           "  advance stop:    %lu.%ld\n"
-                           "  func start:      %lu.%ld\n"
-                           "  func stop:       %lu.%ld\n"
-                           "  func key:        %u\n"
-                           "  func type:       %u\n",
-                           sc,
-                           atomic_read(&sc->sc_kref.refcount),
-                           &saddr, inet ? ntohs(sport) : 0,
-                           &daddr, inet ? ntohs(dport) : 0,
-                           sc->sc_node->nd_name,
-                           sc->sc_page_off,
-                           sc->sc_handshake_ok,
-                           TV_SEC_USEC(sc->sc_tv_timer),
-                           TV_SEC_USEC(sc->sc_tv_data_ready),
-                           TV_SEC_USEC(sc->sc_tv_advance_start),
-                           TV_SEC_USEC(sc->sc_tv_advance_stop),
-                           TV_SEC_USEC(sc->sc_tv_func_start),
-                           TV_SEC_USEC(sc->sc_tv_func_stop),
-                           sc->sc_msg_key,
-                           sc->sc_msg_type);
        }
        spin_unlock(&o2net_debug_lock);
        return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
        .show = sc_seq_show,
 };
-static int sc_fop_open(struct inode *inode, struct file *file)
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
 {
        struct o2net_sock_container *dummy_sc;
        struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
                goto out;
        seq = file->private_data;
-        seq->private = dummy_sc;
+        seq->private = sd;
+        sd->dbg_sock = dummy_sc;
        o2net_debug_add_sc(dummy_sc);
        dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
 static int sc_fop_release(struct inode *inode, struct file *file)
 {
        struct seq_file *seq = file->private_data;
-        struct o2net_sock_container *dummy_sc = seq->private;
+        struct o2net_sock_debug *sd = seq->private;
+        struct o2net_sock_container *dummy_sc = sd->dbg_sock;
        o2net_debug_del_sc(dummy_sc);
        return seq_release_private(inode, file);
 }
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_sock_debug *sd;
+        sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+        if (sd == NULL)
+                return -ENOMEM;
+        sd->dbg_ctxt = SHOW_SOCK_STATS;
+        sd->dbg_sock = NULL;
+        return sc_common_open(file, sd);
+}
+static const struct file_operations stats_seq_fops = {
+        .open = stats_fop_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = sc_fop_release,
+};
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_sock_debug *sd;
+        sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+        if (sd == NULL)
+                return -ENOMEM;
+        sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+        sd->dbg_sock = NULL;
+        return sc_common_open(file, sd);
+}
 static const struct file_operations sc_seq_fops = {
        .open = sc_fop_open,
        .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
                goto bail;
        }
+        stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                           o2net_dentry, NULL,
+                                           &stats_seq_fops);
+        if (!stats_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
        return 0;
 bail:
-        if (sc_dentry)
+        debugfs_remove(stats_dentry);
-                debugfs_remove(sc_dentry);
+        debugfs_remove(sc_dentry);
-        if (nst_dentry)
+        debugfs_remove(nst_dentry);
-                debugfs_remove(nst_dentry);
+        debugfs_remove(o2net_dentry);
-        if (o2net_dentry)
-                debugfs_remove(o2net_dentry);
        return -ENOMEM;
 }
 void o2net_debugfs_exit(void)
 {
-        if (sc_dentry)
+        debugfs_remove(stats_dentry);
-                debugfs_remove(sc_dentry);
+        debugfs_remove(sc_dentry);
-        if (nst_dentry)
+        debugfs_remove(nst_dentry);
-                debugfs_remove(nst_dentry);
+        debugfs_remove(o2net_dentry);
-        if (o2net_dentry)
-                debugfs_remove(o2net_dentry);
 }
 #endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e1669621..a87366750f2 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
 void o2quo_exit(void)
 {
-        flush_scheduled_work();
+        struct o2quo_state *qs = &o2quo_state;
+        flush_work_sync(&qs->qs_work);
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e4212..3b11cb1e38f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
        nst->st_node = node;
 }
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_sock_time);
+        nst->st_sock_time = ktime_get();
 }
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_send_time);
+        nst->st_send_time = ktime_get();
 }
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
 {
-        do_gettimeofday(&nst->st_status_time);
+        nst->st_status_time = ktime_get();
 }
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-                                         struct o2net_sock_container *sc)
+                                                struct o2net_sock_container *sc)
 {
        nst->st_sc = sc;
 }
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+                                        u32 msg_id)
 {
        nst->st_id = msg_id;
 }
-#else  /* CONFIG_DEBUG_FS */
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-                                  u32 msgkey, struct task_struct *task, u8 node)
 {
+        sc->sc_tv_timer = ktime_get();
 }
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_data_ready = ktime_get();
 }
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_advance_start = ktime_get();
 }
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
 {
+        sc->sc_tv_advance_stop = ktime_get();
 }
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
-                                                struct o2net_sock_container *sc)
 {
+        sc->sc_tv_func_start = ktime_get();
 }
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
-                                        u32 msg_id)
 {
+        sc->sc_tv_func_stop = ktime_get();
 }
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+        return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+#else  /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
+# define o2net_get_func_run_time(a)             (ktime_t)0
 #endif /* CONFIG_DEBUG_FS */
+#ifdef CONFIG_OCFS2_FS_STATS
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+                                    struct o2net_sock_container *sc)
+{
+        sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+                                           ktime_sub(ktime_get(),
+                                                     nst->st_status_time));
+        sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+                                         ktime_sub(nst->st_status_time,
+                                                   nst->st_send_time));
+        sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+                                            ktime_sub(nst->st_send_time,
+                                                      nst->st_sock_time));
+        sc->sc_send_count++;
+}
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+        sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+                                            o2net_get_func_run_time(sc));
+        sc->sc_recv_count++;
+}
+#else
+# define o2net_update_send_stats(a, b)
+# define o2net_update_recv_stats(sc)
+#endif /* CONFIG_OCFS2_FS_STATS */
 static inline int o2net_reconnect_delay(void)
 {
        return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
                sc->sc_sock = NULL;
        }
+        o2nm_undepend_item(&sc->sc_node->nd_item);
        o2nm_node_put(sc->sc_node);
        sc->sc_node = NULL;
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 {
        struct o2net_sock_container *sc, *ret = NULL;
        struct page *page = NULL;
+        int status = 0;
        page = alloc_page(GFP_NOFS);
        sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
        o2nm_node_get(node);
        sc->sc_node = node;
+        /* pin the node item of the remote node */
+        status = o2nm_depend_item(&node->nd_item);
+        if (status) {
+                mlog_errno(status);
+                o2nm_node_put(node);
+                goto out;
+        }
        INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
        INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
        INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
        if (sk->sk_user_data) {
                struct o2net_sock_container *sc = sk->sk_user_data;
                sclog(sc, "data_ready hit\n");
-                do_gettimeofday(&sc->sc_tv_data_ready);
+                o2net_set_data_ready_time(sc);
                o2net_sc_queue_work(sc, &sc->sc_rx_work);
                ready = sc->sc_data_ready;
        } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        o2net_set_nst_status_time(&nst);
        wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+        o2net_update_send_stats(&nst, sc);
        /* Note that we avoid overwriting the callers status return
         * variable if a system error was reported on the other
         * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
        if (syserr != O2NET_ERR_NONE)
                goto out_respond;
-        do_gettimeofday(&sc->sc_tv_func_start);
+        o2net_set_func_start_time(sc);
        sc->sc_msg_key = be32_to_cpu(hdr->key);
        sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
        handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
                                             be16_to_cpu(hdr->data_len),
                                        nmh->nh_func_data, &ret_data);
-        do_gettimeofday(&sc->sc_tv_func_stop);
+        o2net_set_func_stop_time(sc);
+        o2net_update_recv_stats(sc);
 out_respond:
        /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
        size_t datalen;
        sclog(sc, "receiving\n");
-        do_gettimeofday(&sc->sc_tv_advance_start);
+        o2net_set_advance_start_time(sc);
        if (unlikely(sc->sc_handshake_ok == 0)) {
                if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
 out:
        sclog(sc, "ret = %d\n", ret);
-        do_gettimeofday(&sc->sc_tv_advance_stop);
+        o2net_set_advance_stop_time(sc);
        return ret;
 }
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
 {
        struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
        struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-        struct timeval now;
-        do_gettimeofday(&now);
+#ifdef CONFIG_DEBUG_FS
+        ktime_t now = ktime_get();
+#endif
        printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
             "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
                     o2net_idle_timeout() / 1000,
                     o2net_idle_timeout() % 1000);
-        mlog(ML_NOTICE, "here are some times that might help debug the "
-             "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
+#ifdef CONFIG_DEBUG_FS
-             "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
+        mlog(ML_NOTICE, "Here are some times that might help debug the "
-             sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
+             "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
-             now.tv_sec, (long) now.tv_usec,
+             "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
-             sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
-             sc->sc_tv_advance_start.tv_sec,
+             (long long)ktime_to_us(sc->sc_tv_data_ready),
-             (long) sc->sc_tv_advance_start.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_advance_start),
-             sc->sc_tv_advance_stop.tv_sec,
+             (long long)ktime_to_us(sc->sc_tv_advance_stop),
-             (long) sc->sc_tv_advance_stop.tv_usec,
             sc->sc_msg_key, sc->sc_msg_type,
-             sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
+             (long long)ktime_to_us(sc->sc_tv_func_start),
-             sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+             (long long)ktime_to_us(sc->sc_tv_func_stop));
+#endif
        /*
         * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
        o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
        o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
                      msecs_to_jiffies(o2net_keepalive_delay()));
-        do_gettimeofday(&sc->sc_tv_timer);
+        o2net_set_sock_timer(sc);
        mod_timer(&sc->sc_idle_timeout,
               jiffies + msecs_to_jiffies(o2net_idle_timeout()));
 }
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4..4cbcb65784a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
        /* original handlers for the sockets */
        void                    (*sc_state_change)(struct sock *sk);
        void                    (*sc_data_ready)(struct sock *sk, int bytes);
-#ifdef CONFIG_DEBUG_FS
-        struct list_head        sc_net_debug_item;
-#endif
-        struct timeval          sc_tv_timer;
-        struct timeval          sc_tv_data_ready;
-        struct timeval          sc_tv_advance_start;
-        struct timeval          sc_tv_advance_stop;
-        struct timeval          sc_tv_func_start;
-        struct timeval          sc_tv_func_stop;
        u32                     sc_msg_key;
        u16                     sc_msg_type;
+#ifdef CONFIG_DEBUG_FS
+        struct list_head        sc_net_debug_item;
+        ktime_t                 sc_tv_timer;
+        ktime_t                 sc_tv_data_ready;
+        ktime_t                 sc_tv_advance_start;
+        ktime_t                 sc_tv_advance_stop;
+        ktime_t                 sc_tv_func_start;
+        ktime_t                 sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+        ktime_t                 sc_tv_acquiry_total;
+        ktime_t                 sc_tv_send_total;
+        ktime_t                 sc_tv_status_total;
+        u32                     sc_send_count;
+        u32                     sc_recv_count;
+        ktime_t                 sc_tv_process_total;
+#endif
        struct mutex            sc_send_lock;
 };
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
        u32                             st_msg_type;
        u32                             st_msg_key;
        u8                              st_node;
-        struct timeval                  st_sock_time;
+        ktime_t                         st_sock_time;
-        struct timeval                  st_send_time;
+        ktime_t                         st_send_time;
-        struct timeval                  st_status_time;
+        ktime_t                         st_status_time;
 };
 #else
 struct o2net_send_tracking {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e..6d80ecc7834 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
                                   struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
        int ret = 0;    /* if all else fails, just return false */
-        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+        struct ocfs2_super *osb;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        osb = OCFS2_SB(dentry->d_sb);
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -169,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
        struct list_head *p;
        struct dentry *dentry = NULL;
-        spin_lock(&dcache_lock);
+        spin_lock(&inode->i_lock);
        list_for_each(p, &inode->i_dentry) {
                dentry = list_entry(p, struct dentry, d_alias);
+                spin_lock(&dentry->d_lock);
                if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
                        mlog(0, "dentry found: %.*s\n",
                             dentry->d_name.len, dentry->d_name.name);
-                        dget_locked(dentry);
+                        dget_dlock(dentry);
+                        spin_unlock(&dentry->d_lock);
                        break;
                }
+                spin_unlock(&dentry->d_lock);
                dentry = NULL;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&inode->i_lock);
        return dentry;
 }
@@ -476,7 +484,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 out:
        iput(inode);
-        ocfs2_dentry_attach_gen(dentry);
 }
 /*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7a..d417b3f9b0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        di->i_dx_root = cpu_to_le64(dr_blkno);
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        ocfs2_journal_dirty(handle, di_bh);
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
                goto out_commit;
        }
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        di->i_dx_root = cpu_to_le64(0ULL);
        ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f4499915683..3a3ed4bb794 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
+        struct dlm_lock_resource *res;
        BUG_ON(!dlm);
        BUG_ON(!lock);
+        res = lock->lockres;
        assert_spin_locked(&dlm->ast_lock);
        if (!list_empty(&lock->ast_list)) {
-                mlog(ML_ERROR, "ast list not empty!!  pending=%d, newlevel=%d\n",
+                mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+                     "AST list not empty, pending %d, newlevel %d\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                     lock->ast_pending, lock->ml.type);
                BUG();
        }
        if (lock->ast_pending)
-                mlog(0, "lock has an ast getting flushed right now\n");
+                mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        /* putting lock on list, add a ref */
        dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        /* check to see if this ast obsoletes the bast */
        if (dlm_should_cancel_bast(dlm, lock)) {
-                struct dlm_lock_resource *res = lock->lockres;
+                mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
-                mlog(0, "%s: cancelling bast for %.*s\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
-                     dlm->name, res->lockname.len, res->lockname.name);
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
                lock->bast_pending = 0;
                list_del_init(&lock->bast_list);
                lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
        BUG_ON(!dlm);
        BUG_ON(!lock);
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
+        struct dlm_lock_resource *res;
        BUG_ON(!dlm);
        BUG_ON(!lock);
        assert_spin_locked(&dlm->ast_lock);
+        res = lock->lockres;
        BUG_ON(!list_empty(&lock->bast_list));
        if (lock->bast_pending)
-                mlog(0, "lock has a bast getting flushed right now\n");
+                mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        /* putting lock on list, add a ref */
        dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
-        mlog_entry_void();
        BUG_ON(!dlm);
        BUG_ON(!lock);
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        dlm_astlockfunc_t *fn;
        struct dlm_lockstatus *lksb;
-        mlog_entry_void();
+        mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+             res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        lksb = lock->lksb;
        fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        struct dlm_lockstatus *lksb;
        int lksbflags;
-        mlog_entry_void();
+        mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+             res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
        lksb = lock->lksb;
        BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 {
        dlm_bastlockfunc_t *fn = lock->bast;
-        mlog_entry_void();
        BUG_ON(lock->ml.node != dlm->node_num);
+        mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+             dlm->name, res->lockname.len, res->lockname.name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+             blocked_type);
        (*fn)(lock->astdata, blocked_type);
 }
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        /* cannot get a proxy ast message if this node owns it */
        BUG_ON(res->owner == dlm->node_num);
-        mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+        mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+             res->lockname.name);
        spin_lock(&res->spinlock);
        if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
        if (past->type == DLM_AST) {
                /* do not alter lock refcount.  switching lists. */
                list_move_tail(&lock->list, &res->granted);
-                mlog(0, "ast: Adding to granted list... type=%d, "
+                mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
-                     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+                     lock->ml.type, lock->ml.convert_type);
                if (lock->ml.convert_type != LKM_IVMODE) {
                        lock->ml.type = lock->ml.convert_type;
                        lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        size_t veclen = 1;
        int status;
-        mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
+        mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
-                   res->lockname.len, res->lockname.name, lock->ml.node,
+             res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
-                   msg_type, blocked_type);
+             blocked_type);
        memset(&past, 0, sizeof(struct dlm_proxy_ast));
        past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        vec[0].iov_len = sizeof(struct dlm_proxy_ast);
        vec[0].iov_base = &past;
        if (flags & DLM_LKSB_GET_LVB) {
-                mlog(0, "returning requested LVB data\n");
                be32_add_cpu(&past.flags, LKM_GET_LVB);
                vec[1].iov_len = DLM_LVB_LEN;
                vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
                                     lock->ml.node, &status);
        if (ret < 0)
-                mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
-                     "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+                     dlm->name, res->lockname.len, res->lockname.name, ret,
                     lock->ml.node);
        else {
                if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5..4bdf7baee34 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 enum dlm_mle_type {
-        DLM_MLE_BLOCK,
+        DLM_MLE_BLOCK = 0,
-        DLM_MLE_MASTER,
+        DLM_MLE_MASTER = 1,
-        DLM_MLE_MIGRATION,
+        DLM_MLE_MIGRATION = 2,
-        DLM_MLE_NUM_TYPES
+        DLM_MLE_NUM_TYPES = 3,
 };
 struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
 enum dlm_ast_type {
        DLM_AST = 0,
-        DLM_BAST,
+        DLM_BAST = 1,
-        DLM_ASTUNLOCK
+        DLM_ASTUNLOCK = 2,
 };
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
 enum dlm_ctxt_state {
        DLM_CTXT_NEW = 0,
-        DLM_CTXT_JOINED,
+        DLM_CTXT_JOINED = 1,
-        DLM_CTXT_IN_SHUTDOWN,
+        DLM_CTXT_IN_SHUTDOWN = 2,
-        DLM_CTXT_LEAVING,
+        DLM_CTXT_LEAVING = 3,
 };
 struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
 enum dlm_lockres_list {
        DLM_GRANTED_LIST = 0,
-        DLM_CONVERTING_LIST,
+        DLM_CONVERTING_LIST = 1,
-        DLM_BLOCKED_LIST
+        DLM_BLOCKED_LIST = 2,
 };
 static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
 enum {
-        DLM_MASTER_REQUEST_MSG    = 500,
+        DLM_MASTER_REQUEST_MSG          = 500,
-        DLM_UNUSED_MSG1,         /* 501 */
+        DLM_UNUSED_MSG1                 = 501,
-        DLM_ASSERT_MASTER_MSG,   /* 502 */
+        DLM_ASSERT_MASTER_MSG           = 502,
-        DLM_CREATE_LOCK_MSG,     /* 503 */
+        DLM_CREATE_LOCK_MSG             = 503,
-        DLM_CONVERT_LOCK_MSG,    /* 504 */
+        DLM_CONVERT_LOCK_MSG            = 504,
-        DLM_PROXY_AST_MSG,       /* 505 */
+        DLM_PROXY_AST_MSG               = 505,
-        DLM_UNLOCK_LOCK_MSG,     /* 506 */
+        DLM_UNLOCK_LOCK_MSG             = 506,
-        DLM_DEREF_LOCKRES_MSG,   /* 507 */
+        DLM_DEREF_LOCKRES_MSG           = 507,
-        DLM_MIGRATE_REQUEST_MSG, /* 508 */
+        DLM_MIGRATE_REQUEST_MSG         = 508,
-        DLM_MIG_LOCKRES_MSG,     /* 509 */
+        DLM_MIG_LOCKRES_MSG             = 509,
-        DLM_QUERY_JOIN_MSG,      /* 510 */
+        DLM_QUERY_JOIN_MSG              = 510,
-        DLM_ASSERT_JOINED_MSG,   /* 511 */
+        DLM_ASSERT_JOINED_MSG           = 511,
-        DLM_CANCEL_JOIN_MSG,     /* 512 */
+        DLM_CANCEL_JOIN_MSG             = 512,
-        DLM_EXIT_DOMAIN_MSG,     /* 513 */
+        DLM_EXIT_DOMAIN_MSG             = 513,
-        DLM_MASTER_REQUERY_MSG,  /* 514 */
+        DLM_MASTER_REQUERY_MSG          = 514,
-        DLM_LOCK_REQUEST_MSG,    /* 515 */
+        DLM_LOCK_REQUEST_MSG            = 515,
-        DLM_RECO_DATA_DONE_MSG,  /* 516 */
+        DLM_RECO_DATA_DONE_MSG          = 516,
-        DLM_BEGIN_RECO_MSG,      /* 517 */
+        DLM_BEGIN_RECO_MSG              = 517,
-        DLM_FINALIZE_RECO_MSG,   /* 518 */
+        DLM_FINALIZE_RECO_MSG           = 518,
-        DLM_QUERY_REGION,        /* 519 */
+        DLM_QUERY_REGION                = 519,
-        DLM_QUERY_NODEINFO,      /* 520 */
+        DLM_QUERY_NODEINFO              = 520,
 };
 struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
 enum {
        DLM_RECO_NODE_DATA_DEAD = -1,
        DLM_RECO_NODE_DATA_INIT = 0,
-        DLM_RECO_NODE_DATA_REQUESTING,
+        DLM_RECO_NODE_DATA_REQUESTING = 1,
-        DLM_RECO_NODE_DATA_REQUESTED,
+        DLM_RECO_NODE_DATA_REQUESTED = 2,
-        DLM_RECO_NODE_DATA_RECEIVING,
+        DLM_RECO_NODE_DATA_RECEIVING = 3,
-        DLM_RECO_NODE_DATA_DONE,
+        DLM_RECO_NODE_DATA_DONE = 4,
-        DLM_RECO_NODE_DATA_FINALIZE_SENT,
+        DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
 };
 enum {
        DLM_MASTER_RESP_NO = 0,
-        DLM_MASTER_RESP_YES,
+        DLM_MASTER_RESP_YES = 1,
-        DLM_MASTER_RESP_MAYBE,
+        DLM_MASTER_RESP_MAYBE = 2,
-        DLM_MASTER_RESP_ERROR
+        DLM_MASTER_RESP_ERROR = 3,
 };
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
 #define DLM_MOD_KEY (0x666c6172)
 enum dlm_query_join_response_code {
        JOIN_DISALLOW = 0,
-        JOIN_OK,
+        JOIN_OK = 1,
-        JOIN_OK_NO_MAP,
+        JOIN_OK_NO_MAP = 2,
-        JOIN_PROTOCOL_MISMATCH,
+        JOIN_PROTOCOL_MISMATCH = 3,
 };
 struct dlm_query_join_packet {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a5..04a32be0aeb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
        kref_get(&dc->debug_refcnt);
 }
-static struct debug_buffer *debug_buffer_allocate(void)
+static int debug_release(struct inode *inode, struct file *file)
 {
-        struct debug_buffer *db = NULL;
+        free_page((unsigned long)file->private_data);
+        return 0;
-        db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
-        if (!db)
-                goto bail;
-        db->len = PAGE_SIZE;
-        db->buf = kmalloc(db->len, GFP_KERNEL);
-        if (!db->buf)
-                goto bail;
-        return db;
-bail:
-        kfree(db);
-        return NULL;
-}
-static ssize_t debug_buffer_read(struct file *file, char __user *buf,
-                                 size_t nbytes, loff_t *ppos)
-{
-        struct debug_buffer *db = file->private_data;
-        return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
-}
-static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
-{
-        struct debug_buffer *db = file->private_data;
-        loff_t new = -1;
-        switch (whence) {
-        case 0:
-                new = off;
-                break;
-        case 1:
-                new = file->f_pos + off;
-                break;
-        }
-        if (new < 0 || new > db->len)
-                return -EINVAL;
-        return (file->f_pos = new);
 }
-static int debug_buffer_release(struct inode *inode, struct file *file)
+static ssize_t debug_read(struct file *file, char __user *buf,
+                          size_t nbytes, loff_t *ppos)
 {
-        struct debug_buffer *db = file->private_data;
+        return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                       i_size_read(file->f_mapping->host));
-        if (db)
-                kfree(db->buf);
-        kfree(db);
-        return 0;
 }
 /* end - util funcs */
 /* begin - purge list funcs */
-static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        struct dlm_lock_resource *res;
        int out = 0;
        unsigned long total = 0;
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dumping Purgelist for Domain: %s\n", dlm->name);
        spin_lock(&dlm->spinlock);
        list_for_each_entry(res, &dlm->purge_list, purge) {
                ++total;
-                if (db->len - out < 100)
+                if (len - out < 100)
                        continue;
                spin_lock(&res->spinlock);
                out += stringify_lockname(res->lockname.name,
                                          res->lockname.len,
-                                          db->buf + out, db->len - out);
+                                          buf + out, len - out);
-                out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+                out += snprintf(buf + out, len - out, "\t%ld\n",
                                (jiffies - res->last_used)/HZ);
                spin_unlock(&res->spinlock);
        }
        spin_unlock(&dlm->spinlock);
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
-                        "Total on list: %ld\n", total);
        return out;
 }
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_purgelist_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_purgelist_print(dlm, db);
+        i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -480,14 +434,14 @@ bail:
 static const struct file_operations debug_purgelist_fops = {
        .open =         debug_purgelist_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end - purge list funcs */
 /* begin - debug mle funcs */
-static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        struct dlm_master_list_entry *mle;
        struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        int i, out = 0;
        unsigned long total = 0, longest = 0, bucket_count = 0;
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dumping MLEs for Domain: %s\n", dlm->name);
        spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                                          master_hash_node);
                        ++total;
                        ++bucket_count;
-                        if (db->len - out < 200)
+                        if (len - out < 200)
                                continue;
-                        out += dump_mle(mle, db->buf + out, db->len - out);
+                        out += dump_mle(mle, buf + out, len - out);
                }
                longest = max(longest, bucket_count);
                bucket_count = 0;
        }
        spin_unlock(&dlm->master_lock);
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Total: %ld, Longest: %ld\n", total, longest);
        return out;
 }
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_mle_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_mle_print(dlm, db);
+        i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -540,9 +494,9 @@ bail:
 static const struct file_operations debug_mle_fops = {
        .open =         debug_mle_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
 /* end - debug lockres funcs */
 /* begin - debug state funcs */
-static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 {
        int out = 0;
        struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        }
        /* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Domain: %s  Key: 0x%08x  Protocol: %d.%d\n",
                        dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
                        dlm->dlm_locking_proto.pv_minor);
        /* Thread Pid: xxx  Node: xxx  State: xxxxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Thread Pid: %d  Node: %d  State: %s\n",
-                        dlm->dlm_thread_task->pid, dlm->node_num, state);
+                        task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
        /* Number of Joins: xxx  Joining Node: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Number of Joins: %d  Joining Node: %d\n",
                        dlm->num_joins, dlm->joining_node);
        /* Domain Map: xx xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+        out += snprintf(buf + out, len - out, "Domain Map: ");
        out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Live Map: xx xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+        out += snprintf(buf + out, len - out, "Live Map: ");
        out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Lock Resources: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Lock Resources: %d (%d)\n",
                        atomic_read(&dlm->res_cur_count),
                        atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                cur_mles += atomic_read(&dlm->mle_cur_count[i]);
        /* MLEs: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "MLEs: %d (%d)\n", cur_mles, tot_mles);
        /*  Blocking: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Blocking: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
        /*  Mastery: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Mastery: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
        /*  Migration: xxx (xxx) */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "  Migration: %d (%d)\n",
                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
        /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
                        "PendingBASTs=%s\n",
                        (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                        (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
        /* Purge Count: xxx  Refs: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Purge Count: %d  Refs: %d\n", dlm->purge_count,
                        atomic_read(&dlm->dlm_refs.refcount));
        /* Dead Node: xxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Dead Node: %d\n", dlm->reco.dead_node);
        /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                state = "INACTIVE";
        /* Recovery Pid: xxxx  Master: xxx  State: xxxx */
-        out += snprintf(db->buf + out, db->len - out,
+        out += snprintf(buf + out, len - out,
                        "Recovery Pid: %d  Master: %d  State: %s\n",
-                        dlm->dlm_reco_thread_task->pid,
+                        task_pid_nr(dlm->dlm_reco_thread_task),
                        dlm->reco.new_master, state);
        /* Recovery Map: xx xx */
-        out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+        out += snprintf(buf + out, len - out, "Recovery Map: ");
        out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
-                                 db->buf + out, db->len - out);
+                                 buf + out, len - out);
-        out += snprintf(db->buf + out, db->len - out, "\n");
+        out += snprintf(buf + out, len - out, "\n");
        /* Recovery Node State: */
-        out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+        out += snprintf(buf + out, len - out, "Recovery Node State:\n");
        list_for_each_entry(node, &dlm->reco.node_data, list) {
                switch (node->state) {
                case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                        state = "BAD";
                        break;
                }
-                out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+                out += snprintf(buf + out, len - out, "\t%u - %s\n",
                                node->node_num, state);
        }
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 static int debug_state_open(struct inode *inode, struct file *file)
 {
        struct dlm_ctxt *dlm = inode->i_private;
-        struct debug_buffer *db = NULL;
+        char *buf = NULL;
-        db = debug_buffer_allocate();
+        buf = (char *) get_zeroed_page(GFP_NOFS);
-        if (!db)
+        if (!buf)
                goto bail;
-        db->len = debug_state_print(dlm, db);
+        i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
-        file->private_data = db;
+        file->private_data = buf;
        return 0;
 bail:
@@ -936,9 +890,9 @@ bail:
 static const struct file_operations debug_state_fops = {
        .open =         debug_state_open,
-        .release =      debug_buffer_release,
+        .release =      debug_release,
-        .read =         debug_buffer_read,
+        .read =         debug_read,
-        .llseek =       debug_buffer_llseek,
+        .llseek =       generic_file_llseek,
 };
 /* end  - debug state funcs */
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
        struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
        if (dc) {
-                if (dc->debug_purgelist_dentry)
+                debugfs_remove(dc->debug_purgelist_dentry);
-                        debugfs_remove(dc->debug_purgelist_dentry);
+                debugfs_remove(dc->debug_mle_dentry);
-                if (dc->debug_mle_dentry)
+                debugfs_remove(dc->debug_lockres_dentry);
-                        debugfs_remove(dc->debug_mle_dentry);
+                debugfs_remove(dc->debug_state_dentry);
-                if (dc->debug_lockres_dentry)
-                        debugfs_remove(dc->debug_lockres_dentry);
-                if (dc->debug_state_dentry)
-                        debugfs_remove(dc->debug_state_dentry);
                dlm_debug_put(dc);
        }
 }
@@ -1040,8 +990,7 @@ bail:
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-        if (dlm->dlm_debugfs_subroot)
+        debugfs_remove(dlm->dlm_debugfs_subroot);
-                debugfs_remove(dlm->dlm_debugfs_subroot);
 }
 /* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
 void dlm_destroy_debugfs_root(void)
 {
-        if (dlm_debugfs_root)
+        debugfs_remove(dlm_debugfs_root);
-                debugfs_remove(dlm_debugfs_root);
 }
 #endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c..1f27c4812d1 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
        struct dentry *debug_purgelist_dentry;
 };
-struct debug_buffer {
-        int len;
-        char *buf;
-};
 struct debug_lockres {
        int dl_len;
        char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b95373..7e38a072d72 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
                }
                cond_resched_lock(&dlm->spinlock);
                num += n;
-                mlog(0, "%s: touched %d lockreses in bucket %d "
-                     "(tot=%d)\n", dlm->name, n, i, num);
        }
        spin_unlock(&dlm->spinlock);
        wake_up(&dlm->dlm_thread_wq);
@@ -959,7 +957,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
                r += O2HB_MAX_REGION_NAME_LEN;
        }
-        local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+        local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
        if (!local) {
                status = -ENOMEM;
                goto bail;
@@ -1661,8 +1659,8 @@ bail:
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
 {
-        o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+        o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
-        o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
+        o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
        o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
 }
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
                            dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
-        status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
+        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
        if (status)
                goto bail;
        o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
                            dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
-        status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
+        status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
        if (status)
                goto bail;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c..7009292aac5 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
                if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
                        return 0;
+                if (!dlm_lock_compatible(tmplock->ml.convert_type,
+                                         lock->ml.type))
+                        return 0;
        }
        return 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80..59f0f6bdfc6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res,
-                                      int *numlocks)
+                                      int *numlocks,
+                                      int *hasrefs)
 {
        int ret;
        int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        assert_spin_locked(&res->spinlock);
+        *numlocks = 0;
+        *hasrefs = 0;
        ret = -EINVAL;
        if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        }
        *numlocks = count;
-        mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+        count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (count < O2NM_MAX_NODES)
+                *hasrefs = 1;
+        mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
+             res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
 leave:
        return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        int wake = 0;
        if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        name = res->lockname.name;
        namelen = res->lockname.len;
-        mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+        mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
        /*
         * ensure this lockres is a proper candidate for migration
         */
        spin_lock(&res->spinlock);
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
        if (ret < 0) {
                spin_unlock(&res->spinlock);
                goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        spin_unlock(&res->spinlock);
        /* no work to do */
-        if (numlocks == 0) {
+        if (numlocks == 0 && !hasrefs)
-                mlog(0, "no locks were found on this lockres! done!\n");
                goto leave;
-        }
        /*
         * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
         * find a node to migrate the lockres to
         */
-        mlog(0, "picking a migration node\n");
        spin_lock(&dlm->spinlock);
        /* pick a new node */
        if (!test_bit(target, dlm->domain_map) ||
            target >= O2NM_MAX_NODES) {
                target = dlm_pick_migration_target(dlm, res);
        }
-        mlog(0, "node %u chosen for migration\n", target);
+        mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
+             namelen, name, target);
        if (target >= O2NM_MAX_NODES ||
            !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        spin_lock(&res->spinlock);
        if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        }
        /* No need to migrate a lockres having no locks */
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-        if (ret >= 0 && numlocks == 0) {
+        if (ret >= 0 && numlocks == 0 && !hasrefs) {
                spin_unlock(&res->spinlock);
                goto leave;
        }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                }
                queue++;
        }
+        nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (nodenum < O2NM_MAX_NODES) {
+                spin_unlock(&res->spinlock);
+                return nodenum;
+        }
        spin_unlock(&res->spinlock);
        mlog(0, "have not found a suitable target yet! checking domain map\n");
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9..1d6d1d22c47 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
 void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                              struct dlm_lock_resource *res)
 {
-        mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&res->spinlock);
        if (__dlm_lockres_unused(res)){
                if (list_empty(&res->purge)) {
-                        mlog(0, "putting lockres %.*s:%p onto purge list\n",
+                        mlog(0, "%s: Adding res %.*s to purge list\n",
-                             res->lockname.len, res->lockname.name, res);
+                             dlm->name, res->lockname.len, res->lockname.name);
                        res->last_used = jiffies;
                        dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                        dlm->purge_count++;
                }
        } else if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
+                mlog(0, "%s: Removing res %.*s from purge list\n",
-                     res->lockname.len, res->lockname.name, res, res->owner);
+                     dlm->name, res->lockname.len, res->lockname.name);
                list_del_init(&res->purge);
                dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                            struct dlm_lock_resource *res)
 {
-        mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
        spin_lock(&dlm->spinlock);
        spin_lock(&res->spinlock);
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
        master = (res->owner == dlm->node_num);
+        mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
-        mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
+             res->lockname.len, res->lockname.name, master);
-             res->lockname.name, master);
        if (!master) {
                res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
                /* clear our bit from the master's refmap, ignore errors */
                ret = dlm_drop_lockres_ref(dlm, res);
                if (ret < 0) {
-                        mlog_errno(ret);
+                        mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
+                             res->lockname.len, res->lockname.name, ret);
                        if (!dlm_is_host_down(ret))
                                BUG();
                }
-                mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
-                     dlm->name, res->lockname.len, res->lockname.name, ret);
                spin_lock(&dlm->spinlock);
                spin_lock(&res->spinlock);
        }
        if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s:%p from purgelist, "
+                mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
-                     "master = %d\n", res->lockname.len, res->lockname.name,
+                     dlm->name, res->lockname.len, res->lockname.name, master);
-                     res, master);
                list_del_init(&res->purge);
                dlm_lockres_put(res);
                dlm->purge_count--;
        }
        if (!__dlm_lockres_unused(res)) {
-                mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+                mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
                     dlm->name, res->lockname.len, res->lockname.name);
                __dlm_print_one_lock_resource(res);
                BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                unused = __dlm_lockres_unused(lockres);
                if (!unused ||
                    (lockres->state & DLM_LOCK_RES_MIGRATING)) {
-                        mlog(0, "lockres %s:%.*s: is in use or "
+                        mlog(0, "%s: res %.*s is in use or being remastered, "
-                             "being remastered, used %d, state %d\n",
+                             "used %d, state %d\n", dlm->name,
-                             dlm->name, lockres->lockname.len,
+                             lockres->lockname.len, lockres->lockname.name,
-                             lockres->lockname.name, !unused, lockres->state);
+                             !unused, lockres->state);
                        list_move_tail(&dlm->purge_list, &lockres->purge);
                        spin_unlock(&lockres->spinlock);
                        continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
        struct list_head *head;
        int can_grant = 1;
-        //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
+        /*
-        //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
+         * Because this function is called with the lockres
-        //mlog(0, "shuffle res %.*s\n", res->lockname.len,
-        //        res->lockname.name);
-        /* because this function is called with the lockres
         * spinlock, and because we know that it is not migrating/
         * recovering/in-progress, it is fine to reserve asts and
-         * basts right before queueing them all throughout */
+         * basts right before queueing them all throughout
+         */
        assert_spin_locked(&dlm->ast_lock);
        assert_spin_locked(&res->spinlock);
        BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
 converting:
        if (list_empty(&res->converting))
                goto blocked;
-        mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
+        mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
-             res->lockname.name);
+             res->lockname.len, res->lockname.name);
        target = list_entry(res->converting.next, struct dlm_lock, list);
        if (target->ml.convert_type == LKM_IVMODE) {
-                mlog(ML_ERROR, "%.*s: converting a lock with no "
+                mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
-                     "convert_type!\n", res->lockname.len, res->lockname.name);
+                     dlm->name, res->lockname.len, res->lockname.name);
                BUG();
        }
        head = &res->granted;
@@ -365,9 +356,12 @@ converting:
                spin_lock(&target->spinlock);
                BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
-                mlog(0, "calling ast for converting lock: %.*s, have: %d, "
+                mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
-                     "granting: %d, node: %u\n", res->lockname.len,
+                     "%d => %d, node %u\n", dlm->name, res->lockname.len,
-                     res->lockname.name, target->ml.type,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+                     target->ml.type,
                     target->ml.convert_type, target->ml.node);
                target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
                spin_lock(&target->spinlock);
                BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
-                mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
+                mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
-                     "node: %u\n", res->lockname.len, res->lockname.name,
+                     "node %u\n", dlm->name, res->lockname.len,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
                     target->ml.type, target->ml.node);
-                // target->ml.type is already correct
+                /* target->ml.type is already correct */
                list_move_tail(&target->list, &res->granted);
                BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
 /* must have NO locks when calling this with res !=NULL * */
 void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
-        mlog_entry("dlm=%p, res=%p\n", dlm, res);
        if (res) {
                spin_lock(&dlm->spinlock);
                spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
-        mlog_entry("dlm=%p, res=%p\n", dlm, res);
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&res->spinlock);
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
                        res->state |= DLM_LOCK_RES_DIRTY;
                }
        }
+        mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+             res->lockname.name);
 }
 /* Launch the NM thread for the mounted volume */
 int dlm_launch_thread(struct dlm_ctxt *dlm)
 {
-        mlog(0, "starting dlm thread...\n");
+        mlog(0, "Starting dlm_thread...\n");
        dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
        if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
 void dlm_complete_thread(struct dlm_ctxt *dlm)
 {
        if (dlm->dlm_thread_task) {
-                mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+                mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
                kthread_stop(dlm->dlm_thread_task);
                dlm->dlm_thread_task = NULL;
        }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* get an extra ref on lock */
                dlm_lock_get(lock);
                res = lock->lockres;
-                mlog(0, "delivering an ast for this lockres\n");
+                mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+                     "node %u\n", dlm->name, res->lockname.len,
+                     res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                     lock->ml.type, lock->ml.node);
                BUG_ON(!lock->ast_pending);
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* possible that another ast was queued while
                 * we were delivering the last one */
                if (!list_empty(&lock->ast_list)) {
-                        mlog(0, "aha another ast got queued while "
+                        mlog(0, "%s: res %.*s, AST queued while flushing last "
-                             "we were finishing the last one.  will "
+                             "one\n", dlm->name, res->lockname.len,
-                             "keep the ast_pending flag set.\n");
+                             res->lockname.name);
                } else
                        lock->ast_pending = 0;
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                dlm_lock_put(lock);
                spin_unlock(&dlm->ast_lock);
-                mlog(0, "delivering a bast for this lockres "
+                mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
-                     "(blocked = %d\n", hi);
+                     "blocked %d, node %u\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                     hi, lock->ml.node);
                if (lock->ml.node != dlm->node_num) {
                        ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                /* possible that another bast was queued while
                 * we were delivering the last one */
                if (!list_empty(&lock->bast_list)) {
-                        mlog(0, "aha another bast got queued while "
+                        mlog(0, "%s: res %.*s, BAST queued while flushing last "
-                             "we were finishing the last one.  will "
+                             "one\n", dlm->name, res->lockname.len,
-                             "keep the bast_pending flag set.\n");
+                             res->lockname.name);
                } else
                        lock->bast_pending = 0;
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
                        spin_lock(&res->spinlock);
                        if (res->owner != dlm->node_num) {
                                __dlm_print_one_lock_resource(res);
-                                mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
+                                mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
-                                     res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
+                                     " dirty %d\n", dlm->name,
-                                     res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
+                                     !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
-                                     res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
+                                     !!(res->state & DLM_LOCK_RES_MIGRATING),
-                                     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+                                     !!(res->state & DLM_LOCK_RES_RECOVERING),
+                                     !!(res->state & DLM_LOCK_RES_DIRTY));
                        }
                        BUG_ON(res->owner != dlm->node_num);
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
                                res->state &= ~DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);
                                spin_unlock(&dlm->ast_lock);
-                                mlog(0, "delaying list shuffling for in-"
+                                mlog(0, "%s: res %.*s, inprogress, delay list "
-                                     "progress lockres %.*s, state=%d\n",
+                                     "shuffle, state %d\n", dlm->name,
                                     res->lockname.len, res->lockname.name,
                                     res->state);
                                delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
                         * spinlock and do NOT have the dlm lock.
                         * safe to reserve/queue asts and run the lists. */
-                        mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
-                             "res=%.*s\n", dlm->name,
-                             res->lockname.len, res->lockname.name);
                        /* called while holding lockres lock */
                        dlm_shuffle_lists(dlm, res);
                        res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
                        /* unlikely, but we may need to give time to
                         * other tasks */
                        if (!--n) {
-                                mlog(0, "throttling dlm_thread\n");
+                                mlog(0, "%s: Throttling dlm thread\n",
+                                     dlm->name);
                                break;
                        }
                }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2df490a19e..8c5c0eddc36 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
        return &ip->ip_vfs_inode;
 }
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
 static void dlmfs_evict_inode(struct inode *inode)
 {
        int status;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af..5dbc3062b4f 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
        }
        result = d_obtain_alias(inode);
-        if (!IS_ERR(result))
+        if (IS_ERR(result))
-                result->d_op = &ocfs2_dentry_ops;
-        else
                mlog_errno(PTR_ERR(result));
 bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        }
        parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
-        if (!IS_ERR(parent))
-                parent->d_op = &ocfs2_dentry_ops;
 bail_unlock:
        ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a280..a6651956482 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1307,10 +1307,13 @@ bail:
        return err;
 }
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int ret;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        mlog_entry_void();
        ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
                goto out;
        }
-        ret = generic_permission(inode, mask, ocfs2_check_acl);
+        ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
        ocfs2_inode_unlock(inode, 0);
 out:
@@ -1986,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
        return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
 }
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
                            loff_t len)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_space_resv sr;
        int change_size = 1;
+        int cmd = OCFS2_IOC_RESVSP64;
+        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+                return -EOPNOTSUPP;
        if (!ocfs2_writes_unwritten_extents(osb))
                return -EOPNOTSUPP;
-        if (S_ISDIR(inode->i_mode))
-                return -ENODEV;
        if (mode & FALLOC_FL_KEEP_SIZE)
                change_size = 0;
+        if (mode & FALLOC_FL_PUNCH_HOLE)
+                cmd = OCFS2_IOC_UNRESVSP64;
        sr.l_whence = 0;
        sr.l_start = (s64)offset;
        sr.l_len = (s64)len;
-        return __ocfs2_change_file_space(NULL, inode, offset,
+        return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
-                                         OCFS2_IOC_RESVSP64, &sr, change_size);
+                                         change_size);
 }
 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2241,11 +2248,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        mutex_lock(&inode->i_mutex);
+        ocfs2_iocb_clear_sem_locked(iocb);
 relock:
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
        if (direct_io) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                /* communicate with ocfs2_dio_end_io */
+                ocfs2_iocb_set_sem_locked(iocb);
        }
        /*
@@ -2382,8 +2393,10 @@ out:
                ocfs2_rw_unlock(inode, rw_level);
 out_sems:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        mutex_unlock(&inode->i_mutex);
@@ -2527,6 +2540,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                goto bail;
        }
+        ocfs2_iocb_clear_sem_locked(iocb);
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2549,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        if (filp->f_flags & O_DIRECT) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                ocfs2_iocb_set_sem_locked(iocb);
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
@@ -2575,8 +2591,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        }
 bail:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
        mlog_exit(ret);
@@ -2592,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
        .getxattr       = generic_getxattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
-        .fallocate      = ocfs2_fallocate,
        .fiemap         = ocfs2_fiemap,
 };
@@ -2624,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+        .fallocate      = ocfs2_fallocate,
 };
 const struct file_operations ocfs2_dops = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7..f5afbbef670 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
 int ocfs2_should_update_atime(struct inode *inode,
                              struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f935fd6600d..4068c6c4c6f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
         * #1 and #2 can be simply solved by never taking the lock
         * here for system files (which are the only type we read
         * during mount). It's a heavier approach, but our main
-         * concern is user-accesible files anyway.
+         * concern is user-accessible files anyway.
         *
         * #3 works itself out because we'll eventually take the
         * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ff5744e1e36..849fb4a2e81 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        spin_unlock(&oi->ip_lock);
 bail_add:
-        dentry->d_op = &ocfs2_dentry_ops;
        ret = d_splice_alias(inode, dentry);
        if (inode) {
@@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
                mlog_errno(status);
                goto leave;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry,
        }
        ihold(inode);
-        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
 out_commit:
@@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                 * An error return must mean that no cluster locks
                 * were held on function exit.
                 */
-                if (oi1->ip_blkno != oi2->ip_blkno)
+                if (oi1->ip_blkno != oi2->ip_blkno) {
                        ocfs2_inode_unlock(inode2, 1);
+                        brelse(*bh2);
+                        *bh2 = NULL;
+                }
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
                mlog_errno(status);
                goto bail;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                goto out_commit;
        }
-        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
        status = 0;
 out_commit:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3b..51cd6898e7f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
        char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-        unsigned char            l_level;
+        signed char              l_level;
+        signed char              l_requested;
+        signed char              l_blocking;
        /* Data packed - type enum ocfs2_lock_type */
        unsigned char            l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
        unsigned char            l_action;
        /* Data packed - enum type ocfs2_unlock_action */
        unsigned char            l_unlock_action;
-        unsigned char            l_requested;
-        unsigned char            l_blocking;
        unsigned int             l_pending_gen;
        spinlock_t               l_lock;
@@ -420,6 +420,11 @@ struct ocfs2_super
        struct inode                    *osb_tl_inode;
        struct buffer_head              *osb_tl_bh;
        struct delayed_work             osb_truncate_log_wq;
+        /*
+         * How many clusters in our truncate log.
+         * It must be protected by osb_tl_inode->i_mutex.
+         */
+        unsigned int truncated_clusters;
        struct ocfs2_node_map           osb_recovering_orphan_dirs;
        unsigned int                    *osb_orphan_wipes;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2..bf2e7764920 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
 #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
        NUM_SYSTEM_INODES
 };
-#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
 #define NUM_LOCAL_SYSTEM_INODES \
                (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f92..a5ebe421195 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
                        return c;
        }
-        return c;
+        return NULL;
 }
 /*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5fed60de763..71998d4d61d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
        if (res->sr_bg_blkno) {
                /* Attempt to short-circuit the usual search mechanism
                 * by jumping straight to the most recently used
-                 * allocation group. This helps us mantain some
+                 * allocation group. This helps us maintain some
                 * contiguousness across allocations. */
                status = ocfs2_search_one_group(ac, handle, bits_wanted,
                                                min_bits, res, &bits_left);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef3157..38f986d2447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
-#include <linux/smp_lock.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -570,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
 }
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
                                                unsigned int cbits)
 {
@@ -987,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 }
 /* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
-                          char *path)
 {
        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1007,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
 }
 static const struct quotactl_ops ocfs2_quotactl_ops = {
-        .quota_on       = ocfs2_quota_on,
+        .quota_on_meta  = ocfs2_quota_on,
        .quota_off      = ocfs2_quota_off,
        .quota_sync     = dquot_quota_sync,
        .get_info       = dquot_get_dqinfo,
@@ -2091,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_fs_info = osb;
        sb->s_op = &ocfs2_sops;
+        sb->s_d_op = &ocfs2_dentry_ops;
        sb->s_export_op = &ocfs2_export_ops;
        sb->s_qcop = &ocfs2_quotactl_ops;
        sb->dq_op = &ocfs2_quota_operations;
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023..e52389e1f05 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                return -EINVAL;
        /* Return error if mode is not supported */
-        if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
+        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+                return -EOPNOTSUPP;
+        /* Punch hole must have keep size set */
+        if ((mode & FALLOC_FL_PUNCH_HOLE) &&
+            !(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;
        if (!(file->f_mode & FMODE_WRITE))
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
                return -EFBIG;
-        if (!inode->i_op->fallocate)
+        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;
-        return inode->i_op->fallocate(inode, mode, offset, len);
+        return file->f_op->fallocate(file, mode, offset, len);
 }
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e..a2a5bff774e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
        return &oi->vfs_inode;
 }
-static void openprom_destroy_inode(struct inode *inode)
+static void openprom_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(op_inode_cachep, OP_I(inode));
 }
+static void openprom_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, openprom_i_callback);
+}
 static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 {
        struct inode *inode;
@@ -418,7 +425,7 @@ out_no_root:
 static struct dentry *openprom_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
-        return mount_single(fs_type, flags, data, openprom_fill_super)
+        return mount_single(fs_type, flags, data, openprom_fill_super);
 }
 static struct file_system_type openprom_fs_type = {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e..9c21119512b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
        return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
+ssize_t part_ro_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
+{
+        struct hd_struct *p = dev_to_part(dev);
+        return sprintf(buf, "%d\n", p->policy ? 1 : 0);
+}
 ssize_t part_alignment_offset_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
                   NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
        &dev_attr_partition.attr,
        &dev_attr_start.attr,
        &dev_attr_size.attr,
+        &dev_attr_ro.attr,
        &dev_attr_alignment_offset.attr,
        &dev_attr_discard_alignment.attr,
        &dev_attr_stat.attr,
@@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
        put_device(part_to_dev(part));
 }
+void __delete_partition(struct hd_struct *part)
+{
+        call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
 void delete_partition(struct gendisk *disk, int partno)
 {
        struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
        kobject_put(part->holder_dir);
        device_del(part_to_dev(part));
-        call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+        hd_struct_put(part);
 }
 static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        if (!dev_get_uevent_suppress(ddev))
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
+        hd_ref_init(p);
        return p;
 out_free_info:
@@ -507,65 +522,6 @@ out_put:
        return ERR_PTR(err);
 }
-/* Not exported, helper to add_disk(). */
-void register_disk(struct gendisk *disk)
-{
-        struct device *ddev = disk_to_dev(disk);
-        struct block_device *bdev;
-        struct disk_part_iter piter;
-        struct hd_struct *part;
-        int err;
-        ddev->parent = disk->driverfs_dev;
-        dev_set_name(ddev, disk->disk_name);
-        /* delay uevents, until we scanned partition table */
-        dev_set_uevent_suppress(ddev, 1);
-        if (device_add(ddev))
-                return;
-        if (!sysfs_deprecated) {
-                err = sysfs_create_link(block_depr, &ddev->kobj,
-                                        kobject_name(&ddev->kobj));
-                if (err) {
-                        device_del(ddev);
-                        return;
-                }
-        }
-        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
-        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
-        /* No minors to use for partitions */
-        if (!disk_partitionable(disk))
-                goto exit;
-        /* No such device (e.g., media were just removed) */
-        if (!get_capacity(disk))
-                goto exit;
-        bdev = bdget_disk(disk, 0);
-        if (!bdev)
-                goto exit;
-        bdev->bd_invalidated = 1;
-        err = blkdev_get(bdev, FMODE_READ);
-        if (err < 0)
-                goto exit;
-        blkdev_put(bdev, FMODE_READ);
-exit:
-        /* announce disk after possible partitions are created */
-        dev_set_uevent_suppress(ddev, 0);
-        kobject_uevent(&ddev->kobj, KOBJ_ADD);
-        /* announce possible partitions */
-        disk_part_iter_init(&piter, disk, 0);
-        while ((part = disk_part_iter_next(&piter)))
-                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
-        disk_part_iter_exit(&piter);
-}
 static bool disk_unlock_native_capacity(struct gendisk *disk)
 {
        const struct block_device_operations *bdops = disk->fops;
@@ -728,33 +684,3 @@ fail:
 }
 EXPORT_SYMBOL(read_dev_sector);
-void del_gendisk(struct gendisk *disk)
-{
-        struct disk_part_iter piter;
-        struct hd_struct *part;
-        /* invalidate stuff */
-        disk_part_iter_init(&piter, disk,
-                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
-        while ((part = disk_part_iter_next(&piter))) {
-                invalidate_partition(disk, part->partno);
-                delete_partition(disk, part->partno);
-        }
-        disk_part_iter_exit(&piter);
-        invalidate_partition(disk, 0);
-        blk_free_devt(disk_to_dev(disk)->devt);
-        set_capacity(disk, 0);
-        disk->flags &= ~GENHD_FL_UP;
-        unlink_gendisk(disk);
-        part_stat_set_all(&disk->part0, 0);
-        disk->part0.stamp = 0;
-        kobject_put(disk->part0.holder_dir);
-        kobject_put(disk->slave_dir);
-        disk->driverfs_dev = NULL;
-        if (!sysfs_deprecated)
-                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-        device_del(disk_to_dev(disk));
-}
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a95572..da42f7db50d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
                        break;
                }
                if (do_wakeup) {
-                        wake_up_interruptible_sync(&pipe->wait);
+                        wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }
                pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
        /* Signal writers asynchronously that there is more room. */
        if (do_wakeup) {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
                        break;
                }
                if (do_wakeup) {
-                        wake_up_interruptible_sync(&pipe->wait);
+                        wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                        do_wakeup = 0;
                }
@@ -623,7 +623,7 @@ redo2:
 out:
        mutex_unlock(&inode->i_mutex);
        if (do_wakeup) {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
        if (!pipe->readers && !pipe->writers) {
                free_pipe_info(inode);
        } else {
-                wake_up_interruptible_sync(&pipe->wait);
+                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
@@ -999,12 +999,11 @@ struct file *create_write_pipe(int flags)
                goto err;
        err = -ENOMEM;
-        path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
+        path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
        if (!path.dentry)
                goto err_inode;
        path.mnt = mntget(pipe_mnt);
-        path.dentry->d_op = &pipefs_dentry_operations;
        d_instantiate(path.dentry, inode);
        err = -ENFILE;
@@ -1199,12 +1198,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
        return ret;
 }
+/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+struct pipe_inode_info *get_pipe_info(struct file *file)
+{
+        struct inode *i = file->f_path.dentry->d_inode;
+        return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
+}
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct pipe_inode_info *pipe;
        long ret;
-        pipe = file->f_path.dentry->d_inode->i_pipe;
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
@@ -1241,6 +1252,10 @@ out:
        return ret;
 }
+static const struct super_operations pipefs_ops = {
+        .destroy_inode = free_inode_nonrcu,
+};
 /*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
@@ -1250,7 +1265,8 @@ out:
 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
                         int flags, const char *dev_name, void *data)
 {
-        return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+        return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+                        &pipefs_dentry_operations, PIPEFS_MAGIC);
 }
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748..d42514e3238 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
 */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-        int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
+        int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
        return (mycount > count);
 }
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
 * Check if any of these mounts that **do not have submounts**
 * have more references than 'refcnt'. If so return busy.
 *
- * vfsmount lock must be held for read or write
+ * vfsmount lock must be held for write
 */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec2..b1cf6bf4b41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
+EXPORT_SYMBOL(posix_acl_init);
 EXPORT_SYMBOL(posix_acl_alloc);
 EXPORT_SYMBOL(posix_acl_clone);
 EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
 EXPORT_SYMBOL(posix_acl_permission);
 /*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+        atomic_set(&acl->a_refcount, 1);
+        acl->a_count = count;
+}
+/*
 * Allocate a new ACL with the specified number of entries.
 */
 struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
        const size_t size = sizeof(struct posix_acl) +
                            count * sizeof(struct posix_acl_entry);
        struct posix_acl *acl = kmalloc(size, flags);
-        if (acl) {
+        if (acl)
-                atomic_set(&acl->a_refcount, 1);
+                posix_acl_init(acl, count);
-                acl->a_count = count;
-        }
        return acl;
 }
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d9..15af6222f8a 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
 config PROC_FS
-        bool "/proc file system support" if EMBEDDED
+        bool "/proc file system support" if EXPERT
        default y
        help
          This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
        Exports the dump image of crashed kernel in ELF format.
 config PROC_SYSCTL
-        bool "Sysctl support (/proc/sys)" if EMBEDDED
+        bool "Sysctl support (/proc/sys)" if EXPERT
        depends on PROC_FS
        select SYSCTL
        default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
 config PROC_PAGE_MONITOR
        default y
        depends on PROC_FS && MMU
-        bool "Enable /proc page monitoring" if EMBEDDED
+        bool "Enable /proc page monitoring" if EXPERT
        help
          Various /proc files exist to monitor process memory utilization:
          /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc51..df434c5f28f 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
                proc_tty.o
 proc-y  += cmdline.o
+proc-y  += consoles.o
 proc-y  += cpuinfo.o
 proc-y  += devices.o
 proc-y  += interrupts.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676a..df2b703b9d0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
        get_task_comm(tcomm, p);
-        seq_printf(m, "Name:\t");
+        seq_puts(m, "Name:\t");
        end = m->buf + m->size;
        buf = m->buf + m->count;
        name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
                buf++;
        }
        m->count = buf - m->buf;
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 /*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                seq_printf(m, "%d ", GROUP_AT(group_info, g));
        put_cred(cred);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
 {
        int i;
-        seq_printf(m, "%s", header);
+        seq_puts(m, header);
        i = _NSIG;
        do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
                seq_printf(m, "%x", x);
        } while (i >= 4);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
 {
        unsigned __capi;
-        seq_printf(m, "%s", header);
+        seq_puts(m, header);
        CAP_FOR_EACH_U32(__capi) {
                seq_printf(m, "%08x",
                           a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
        }
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
-        seq_printf(m, "Cpus_allowed:\t");
+        seq_puts(m, "Cpus_allowed:\t");
        seq_cpumask(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
-        seq_printf(m, "Cpus_allowed_list:\t");
+        seq_puts(m, "Cpus_allowed_list:\t");
        seq_cpumask_list(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
+        seq_putc(m, '\n');
 }
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
 {
-        int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
+        unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
        struct mm_struct *mm = get_task_mm(task);
        if (mm) {
                size = task_statm(mm, &shared, &text, &data, &resident);
                mmput(mm);
        }
-        seq_printf(m, "%d %d %d %d %d %d %d\n",
+        seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
-                        size, resident, shared, text, lib, data, 0);
+                        size, resident, shared, text, data);
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461e..9d096e82b20 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
                return -ESRCH;
        seq_puts(m, "Latency Top version : v0.1\n");
        for (i = 0; i < 32; i++) {
-                if (task->latency_record[i].backtrace[0]) {
+                struct latency_record *lr = &task->latency_record[i];
+                if (lr->backtrace[0]) {
                        int q;
-                        seq_printf(m, "%i %li %li ",
+                        seq_printf(m, "%i %li %li",
-                                task->latency_record[i].count,
+                                   lr->count, lr->time, lr->max);
-                                task->latency_record[i].time,
-                                task->latency_record[i].max);
                        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-                                char sym[KSYM_SYMBOL_LEN];
+                                unsigned long bt = lr->backtrace[q];
-                                char *c;
+                                if (!bt)
-                                if (!task->latency_record[i].backtrace[q])
                                        break;
-                                if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+                                if (bt == ULONG_MAX)
                                        break;
-                                sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+                                seq_printf(m, " %ps", (void *)bt);
-                                c = strchr(sym, '+');
-                                if (c)
-                                        *c = 0;
-                                seq_printf(m, "%s ", sym);
                        }
-                        seq_printf(m, "\n");
+                        seq_putc(m, '\n');
                }
        }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
 static int proc_single_open(struct inode *inode, struct file *filp)
 {
-        int ret;
+        return single_open(filp, proc_single_show, inode);
-        ret = single_open(filp, proc_single_show, NULL);
-        if (!ret) {
-                struct seq_file *m = filp->private_data;
-                m->private = inode;
-        }
-        return ret;
 }
 static const struct file_operations proc_single_file_operations = {
@@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto err_task_lock;
        }
-        if (oom_score_adj < task->signal->oom_score_adj &&
+        if (oom_score_adj < task->signal->oom_score_adj_min &&
                        !capable(CAP_SYS_RESOURCE)) {
                err = -EACCES;
                goto err_sighand;
@@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                        atomic_dec(&task->mm->oom_disable_count);
        }
        task->signal->oom_score_adj = oom_score_adj;
+        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+                task->signal->oom_score_adj_min = oom_score_adj;
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
@@ -1386,9 +1375,77 @@ sched_write(struct file *file, const char __user *buf,
 static int sched_open(struct inode *inode, struct file *filp)
 {
+        return single_open(filp, sched_show, inode);
+}
+static const struct file_operations proc_pid_sched_operations = {
+        .open           = sched_open,
+        .read           = seq_read,
+        .write          = sched_write,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * seq_file show callback for the per-task "autogroup" proc file: look up
+ * the task behind the proc inode stored in m->private and print its
+ * autogroup information.  Returns -ESRCH when no task can be obtained,
+ * 0 otherwise.
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+        struct inode *proc_inode = m->private;
+        struct task_struct *task = get_proc_task(proc_inode);
+        if (task == NULL)
+                return -ESRCH;
+        proc_sched_autogroup_show_task(task, m);
+        /* Done with the task obtained from get_proc_task(). */
+        put_task_struct(task);
+        return 0;
+}
+/*
+ * Write handler for the per-task "autogroup" proc file.
+ *
+ * Copies at most sizeof(buffer) - 1 bytes from user space, parses them as
+ * an integer and hands the value to proc_sched_autogroup_set_nice() for
+ * the task behind this proc inode.
+ *
+ * Returns the (possibly clamped) number of bytes consumed on success,
+ * -EFAULT if the user buffer cannot be copied, -EINVAL if the text does
+ * not parse, -ESRCH if the task cannot be obtained, or the non-zero value
+ * returned by proc_sched_autogroup_set_nice().
+ */
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+            size_t count, loff_t *offset)
+{
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct task_struct *p;
+        char buffer[PROC_NUMBUF];
+        long nice;
+        int err;
+        /* Zero the buffer so the clamped copy below is always terminated. */
+        memset(buffer, 0, sizeof(buffer));
+        if (count > sizeof(buffer) - 1)
+                count = sizeof(buffer) - 1;
+        if (copy_from_user(buffer, buf, count))
+                return -EFAULT;
+        /* Any parse failure is reported to the caller as -EINVAL. */
+        err = strict_strtol(strstrip(buffer), 0, &nice);
+        if (err)
+                return -EINVAL;
+        p = get_proc_task(inode);
+        if (!p)
+                return -ESRCH;
+        /*
+         * err does double duty here: it carries the parsed value in (the
+         * argument &err is evaluated before the assignment) and is then
+         * overwritten with the call's return value.
+         * NOTE(review): the long -> int narrowing is not range-checked in
+         * this function; presumably the callee validates it - confirm.
+         */
+        err = nice;
+        err = proc_sched_autogroup_set_nice(p, &err);
+        if (err)
+                count = err;
+        put_task_struct(p);
+        return count;
+}
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
        int ret;
-        ret = single_open(filp, sched_show, NULL);
+        ret = single_open(filp, sched_autogroup_show, NULL);
        if (!ret) {
                struct seq_file *m = filp->private_data;
@@ -1397,15 +1454,15 @@ static int sched_open(struct inode *inode, struct file *filp)
        return ret;
 }
-static const struct file_operations proc_pid_sched_operations = {
+static const struct file_operations proc_pid_sched_autogroup_operations = {
-        .open           = sched_open,
+        .open           = sched_autogroup_open,
        .read           = seq_read,
-        .write          = sched_write,
+        .write          = sched_autogroup_write,
        .llseek         = seq_lseek,
        .release        = single_release,
 };
-#endif
+#endif /* CONFIG_SCHED_AUTOGROUP */
 static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
@@ -1454,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
 static int comm_open(struct inode *inode, struct file *filp)
 {
-        int ret;
+        return single_open(filp, comm_show, inode);
-        ret = single_open(filp, comm_show, NULL);
-        if (!ret) {
-                struct seq_file *m = filp->private_data;
-                m->private = inode;
-        }
-        return ret;
 }
 static const struct file_operations proc_pid_set_comm_operations = {
@@ -1574,7 +1623,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
        if (!tmp)
                return -ENOMEM;
-        pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
+        pathname = d_path(path, tmp, PAGE_SIZE);
        len = PTR_ERR(pathname);
        if (IS_ERR(pathname))
                goto out;
@@ -1719,10 +1768,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
        const struct cred *cred;
+        if (nd && nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
        if (task) {
                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
                    task_dumpable(task)) {
@@ -1744,7 +1799,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
        return 0;
 }
-static int pid_delete_dentry(struct dentry * dentry)
+static int pid_delete_dentry(const struct dentry * dentry)
 {
        /* Is the task we represent dead?
         * If so, then don't put the dentry on the lru list,
@@ -1888,12 +1943,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
-        int fd = proc_fd(inode);
+        int fd;
        struct files_struct *files;
        const struct cred *cred;
+        if (nd && nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
+        fd = proc_fd(inode);
        if (task) {
                files = get_files_struct(task);
                if (files) {
@@ -1969,7 +2031,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
        inode->i_op = &proc_pid_link_inode_operations;
        inode->i_size = 64;
        ei->op.proc_get_link = proc_fd_link;
-        dentry->d_op = &tid_fd_dentry_operations;
+        d_set_d_op(dentry, &tid_fd_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (tid_fd_revalidate(dentry, NULL))
@@ -2101,11 +2163,13 @@ static const struct file_operations proc_fd_operations = {
 * /proc/pid/fd needs a special permission handler so that a process can still
 * access /proc/self/fd after it has executed a setuid().
 */
-static int proc_fd_permission(struct inode *inode, int mask)
+static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
        int rv;
-        rv = generic_permission(inode, mask, NULL);
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        rv = generic_permission(inode, mask, flags, NULL);
        if (rv == 0)
                return 0;
        if (task_pid(current) == proc_pid(inode))
@@ -2137,7 +2201,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
        ei->fd = fd;
        inode->i_mode = S_IFREG | S_IRUSR;
        inode->i_fop = &proc_fdinfo_file_operations;
-        dentry->d_op = &tid_fd_dentry_operations;
+        d_set_d_op(dentry, &tid_fd_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (tid_fd_revalidate(dentry, NULL))
@@ -2196,7 +2260,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
        if (pid_revalidate(dentry, NULL))
@@ -2563,8 +2627,14 @@ static const struct pid_entry proc_base_stuff[] = {
 */
 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode;
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
        if (task) {
                put_task_struct(task);
                return 1;
@@ -2615,7 +2685,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-        dentry->d_op = &proc_base_dentry_operations;
+        d_set_d_op(dentry, &proc_base_dentry_operations);
        d_add(dentry, inode);
        error = NULL;
 out:
@@ -2733,6 +2803,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+        REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        INF("syscall",    S_IRUSR, proc_pid_syscall),
@@ -2926,7 +2999,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
        inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
                ARRAY_SIZE(tgid_base_stuff));
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
@@ -3169,7 +3242,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
        inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
                ARRAY_SIZE(tid_base_stuff));
-        dentry->d_op = &pid_dentry_operations;
+        d_set_d_op(dentry, &pid_dentry_operations);
        d_add(dentry, inode);
        /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 00000000000..b701eaa482b
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ *
+ * Licensed under GPLv2
+ */
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+/*
+ * This is the handler for /proc/consoles: emit one line describing a
+ * single console.
+ *
+ * @m: seq_file receiving the output
+ * @v: iterator cursor, the struct console to describe
+ *
+ * The line holds the console name and index padded out to a fixed column,
+ * an R/W/U triple telling whether the read, write and unblank callbacks
+ * are set, the CON_* flag letters in parentheses and, when a device number
+ * could be derived from the console's tty driver, its major:minor pair.
+ *
+ * Always returns 0.
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+        /* Maps each CON_* flag bit to the letter shown when it is set. */
+        static const struct {
+                short flag;
+                char name;
+        } con_flags[] = {
+                { CON_ENABLED,          'E' },
+                { CON_CONSDEV,          'C' },
+                { CON_BOOT,             'B' },
+                { CON_PRINTBUFFER,      'p' },
+                { CON_BRL,              'b' },
+                { CON_ANYTIME,          'a' },
+        };
+        char flags[ARRAY_SIZE(con_flags) + 1];
+        struct console *con = v;
+        unsigned int a;
+        int len;
+        dev_t dev = 0;
+        /* dev stays 0 unless the console reports a tty driver. */
+        if (con->device) {
+                const struct tty_driver *driver;
+                int index;
+                driver = con->device(con, &index);
+                if (driver) {
+                        dev = MKDEV(driver->major, driver->minor_start);
+                        dev += index;
+                }
+        }
+        /* One column per known flag: its letter when set, a blank if not. */
+        for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+                flags[a] = (con->flags & con_flags[a].flag) ?
+                        con_flags[a].name : ' ';
+        flags[a] = 0;
+        /* %n records how many characters the name took, for padding. */
+        seq_printf(m, "%s%d%n", con->name, con->index, &len);
+        len = 21 - len;
+        if (len < 1)
+                len = 1;
+        seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+                        con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+                        flags);
+        if (dev)
+                seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+        /* A lone character needs no format parsing. */
+        seq_putc(m, '\n');
+        return 0;
+}
+/*
+ * seq_file ->start: take the console lock, then walk the console list and
+ * return the entry whose ordinal equals *pos.  If the walk ends without a
+ * match, the final value of the list cursor is returned (presumably NULL,
+ * depending on how for_each_console() terminates).
+ */
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+        struct console *cursor;
+        loff_t idx = 0;
+        console_lock();
+        for_each_console(cursor) {
+                if (idx == *pos)
+                        break;
+                idx++;
+        }
+        return cursor;
+}
+/*
+ * seq_file ->next: bump the position and step to the console linked after
+ * the current one.
+ */
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        struct console *cur = v;
+        *pos += 1;
+        return cur->next;
+}
+/*
+ * seq_file ->stop: release the console lock (c_start() in this file is
+ * where it gets taken).
+ */
+static void c_stop(struct seq_file *m, void *v)
+{
+        console_unlock();
+}
+/* seq_file iterator callbacks backing /proc/consoles. */
+static const struct seq_operations consoles_op = {
+        .start  = c_start,
+        .next   = c_next,
+        .stop   = c_stop,
+        .show   = show_console_dev
+};
+/* ->open for /proc/consoles: attach the consoles_op iterator to the file. */
+static int consoles_open(struct inode *inode, struct file *file)
+{
+        return seq_open(file, &consoles_op);
+}
+/*
+ * File operations for /proc/consoles: reading and seeking go through the
+ * generic seq_file helpers; no write method is provided.
+ */
+static const struct file_operations proc_consoles_operations = {
+        .open           = consoles_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+/*
+ * Create the "consoles" proc entry (mode 0, NULL parent) at init time.
+ * The result of proc_create() is not checked; this always returns 0.
+ */
+static int __init proc_consoles_init(void)
+{
+        proc_create("consoles", 0, NULL, &proc_consoles_operations);
+        return 0;
+}
+module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c..b14347167c3 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
        if (i < CHRDEV_MAJOR_HASH_SIZE) {
                if (i == 0)
-                        seq_printf(f, "Character devices:\n");
+                        seq_puts(f, "Character devices:\n");
                chrdev_show(f, i);
        }
 #ifdef CONFIG_BLOCK
        else {
                i -= CHRDEV_MAJOR_HASH_SIZE;
                if (i == 0)
-                        seq_printf(f, "\nBlock devices:\n");
+                        seq_puts(f, "\nBlock devices:\n");
                blkdev_show(f, i);
        }
 #endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f033766..01e07f2a188 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
 * smarter: we could keep a "volatile" flag in the 
 * inode to indicate which ones to keep.
 */
-static int proc_delete_dentry(struct dentry * dentry)
+static int proc_delete_dentry(const struct dentry * dentry)
 {
        return 1;
 }
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                if (de->namelen != dentry->d_name.len)
                        continue;
                if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-                        unsigned int ino;
-                        ino = de->low_ino;
                        pde_get(de);
                        spin_unlock(&proc_subdir_lock);
                        error = -EINVAL;
-                        inode = proc_get_inode(dir->i_sb, ino, de);
+                        inode = proc_get_inode(dir->i_sb, de);
                        goto out_unlock;
                }
        }
@@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 out_unlock:
        if (inode) {
-                dentry->d_op = &proc_dentry_operations;
+                d_set_d_op(dentry, &proc_dentry_operations);
                d_add(dentry, inode);
                return NULL;
        }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
 static void free_proc_entry(struct proc_dir_entry *de)
 {
-        unsigned int ino = de->low_ino;
+        release_inode_number(de->low_ino);
-        if (ino < PROC_DYNAMIC_FIRST)
-                return;
-        release_inode_number(ino);
        if (S_ISLNK(de->mode))
                kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
                wait_for_completion(de->pde_unload_completion);
-                goto continue_removing;
+                spin_lock(&de->pde_unload_lock);
        }
-        spin_unlock(&de->pde_unload_lock);
-continue_removing:
-        spin_lock(&de->pde_unload_lock);
        while (!list_empty(&de->pde_openers)) {
                struct pde_opener *pdeo;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f48487..176ce4cda68 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
 #include <linux/limits.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/sysctl.h>
 #include <linux/slab.h>
@@ -66,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
        return inode;
 }
-static void proc_destroy_inode(struct inode *inode)
+static void proc_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
+static void proc_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, proc_i_callback);
+}
 static void init_once(void *foo)
 {
        struct proc_inode *ei = (struct proc_inode *) foo;
@@ -410,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
-struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
-                                struct proc_dir_entry *de)
 {
        struct inode * inode;
-        inode = iget_locked(sb, ino);
+        inode = iget_locked(sb, de->low_ino);
        if (!inode)
                return NULL;
        if (inode->i_state & I_NEW) {
@@ -465,7 +470,7 @@ int proc_fill_super(struct super_block *s)
        s->s_time_gran = 1;
        
        pde_get(&proc_root);
-        root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
+        root_inode = proc_get_inode(s, &proc_root);
        if (!root_inode)
                goto out_no_root;
        root_inode->i_uid = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd1..9ad561ded40 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
+unsigned long task_statm(struct mm_struct *,
+        unsigned long *, unsigned long *, unsigned long *, unsigned long *);
 void task_mem(struct seq_file *, struct mm_struct *);
 static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
 extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 /*
 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468..d245cb23dd7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
        .read           = read_kcore,
        .open           = open_kcore,
-        .llseek         = generic_file_llseek,
+        .llseek         = default_llseek,
 };
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97..ed257d14156 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
                "HardwareCorrupted: %5lu kB\n"
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                "AnonHugePages:  %8lu kB\n"
+#endif
                ,
                K(i.totalram),
                K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                K(i.freeswap),
                K(global_page_state(NR_FILE_DIRTY)),
                K(global_page_state(NR_WRITEBACK)),
-                K(global_page_state(NR_ANON_PAGES)),
+                K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+                  HPAGE_PMD_NR
+#endif
+                  ),
                K(global_page_state(NR_FILE_MAPPED)),
                K(global_page_state(NR_SHMEM)),
                K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
                ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+                   HPAGE_PMD_NR)
+#endif
                );
        hugetlb_report_meminfo(m);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b4566033..6d8e6a9e93a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                        ppage = pfn_to_page(pfn);
                else
                        ppage = NULL;
-                if (!ppage)
+                if (!ppage || PageSlab(ppage))
                        pcount = 0;
                else
                        pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
        if (PageHuge(page))
                u |= 1 << KPF_HUGE;
-        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
        /*
-         * Caveats on high order pages:
+         * Caveats on high order pages: page->_count will only be set
-         * PG_buddy will only be set on the head page; SLUB/SLQB do the same
+         * -1 on the head page; SLUB/SLQB do the same for PG_slab;
-         * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+         * SLOB won't set PG_slab at all on compound pages.
         */
+        if (PageBuddy(page))
+                u |= 1 << KPF_BUDDY;
+        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
-        u |= kpf_copy_bit(k, KPF_BUDDY,         PG_buddy);
        u |= kpf_copy_bit(k, KPF_ERROR,         PG_error);
        u |= kpf_copy_bit(k, KPF_DIRTY,         PG_dirty);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b652cb00906..09a1f92a34e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
+#include <linux/namei.h>
 #include "internal.h"
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -120,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
        err = NULL;
-        dentry->d_op = &proc_sys_dentry_operations;
+        d_set_d_op(dentry, &proc_sys_dentry_operations);
        d_add(dentry, inode);
 out:
@@ -201,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
                                dput(child);
                                return -ENOMEM;
                        } else {
-                                child->d_op = &proc_sys_dentry_operations;
+                                d_set_d_op(child, &proc_sys_dentry_operations);
                                d_add(child, inode);
                        }
                } else {
@@ -294,7 +295,7 @@ out:
        return ret;
 }
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
 {
        /*
         * sysctl entries that are not writeable,
@@ -304,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
        struct ctl_table *table;
        int error;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        /* Executable files are not allowed under /proc/sys/ */
        if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
                return -EACCES;
@@ -389,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
 {
        return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
+static int proc_sys_compare(const struct dentry *parent,
-                            struct qstr *name)
+                const struct inode *pinode,
+                const struct dentry *dentry, const struct inode *inode,
+                unsigned int len, const char *str, const struct qstr *name)
 {
-        struct dentry *dentry = container_of(qstr, struct dentry, d_name);
+        /* Although proc doesn't have negative dentries, rcu-walk means
-        if (qstr->len != name->len)
+         * that inode here can be NULL */
+        if (!inode)
+                return 0;
+        if (name->len != len)
                return 1;
-        if (memcmp(qstr->name, name->name, name->len))
+        if (memcmp(name->name, str, len))
                return 1;
-        return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+        return !sysctl_is_seen(PROC_I(inode)->sysctl);
 }
 static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc86943..cb761f01030 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
        }
        switch (p->type) {
        case TTY_DRIVER_TYPE_SYSTEM:
-                seq_printf(m, "system");
+                seq_puts(m, "system");
                if (p->subtype == SYSTEM_TYPE_TTY)
-                        seq_printf(m, ":/dev/tty");
+                        seq_puts(m, ":/dev/tty");
                else if (p->subtype == SYSTEM_TYPE_SYSCONS)
-                        seq_printf(m, ":console");
+                        seq_puts(m, ":console");
                else if (p->subtype == SYSTEM_TYPE_CONSOLE)
-                        seq_printf(m, ":vtmaster");
+                        seq_puts(m, ":vtmaster");
                break;
        case TTY_DRIVER_TYPE_CONSOLE:
-                seq_printf(m, "console");
+                seq_puts(m, "console");
                break;
        case TTY_DRIVER_TYPE_SERIAL:
-                seq_printf(m, "serial");
+                seq_puts(m, "serial");
                break;
        case TTY_DRIVER_TYPE_PTY:
                if (p->subtype == PTY_TYPE_MASTER)
-                        seq_printf(m, "pty:master");
+                        seq_puts(m, "pty:master");
                else if (p->subtype == PTY_TYPE_SLAVE)
-                        seq_printf(m, "pty:slave");
+                        seq_puts(m, "pty:slave");
                else
-                        seq_printf(m, "pty");
+                        seq_puts(m, "pty");
                break;
        default:
                seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
                /* pseudo-drivers first */
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
-                seq_printf(m, "system:/dev/tty\n");
+                seq_puts(m, "system:/dev/tty\n");
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
-                seq_printf(m, "system:console\n");
+                seq_puts(m, "system:console\n");
 #ifdef CONFIG_UNIX98_PTYS
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
                seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
-                seq_printf(m, "system\n");
+                seq_puts(m, "system\n");
 #endif
 #ifdef CONFIG_VT
                seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
                seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
-                seq_printf(m, "system:vtmaster\n");
+                seq_puts(m, "system:vtmaster\n");
 #endif
        }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c98..62604be9f58 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
        int i, j;
-        seq_printf(p, "                    ");
+        seq_puts(p, "                    ");
        for_each_possible_cpu(i)
                seq_printf(p, "CPU%-8d", i);
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
        for (i = 0; i < NR_SOFTIRQS; i++) {
                seq_printf(p, "%12s:", softirq_to_name[i]);
                for_each_possible_cpu(j)
                        seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
-                seq_printf(p, "\n");
+                seq_putc(p, '\n');
        }
        return 0;
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93ba..1cffa2b8a2f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
        for (i = 0; i < NR_SOFTIRQS; i++)
                seq_printf(p, " %u", per_softirq_sums[i]);
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
        return 0;
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f0..60b914860f8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
        return PAGE_SIZE * mm->total_vm;
 }
-int task_statm(struct mm_struct *mm, int *shared, int *text,
+unsigned long task_statm(struct mm_struct *mm,
-               int *data, int *resident)
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
        *shared = get_mm_counter(mm, MM_FILEPAGES);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
                   "Anonymous:      %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
-                   "MMUPageSize:    %8lu kB\n",
+                   "MMUPageSize:    %8lu kB\n"
+                   "Locked:         %8lu kB\n",
                   (vma->vm_end - vma->vm_start) >> 10,
                   mss.resident >> 10,
                   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
                   mss.anonymous >> 10,
                   mss.swap >> 10,
                   vma_kernel_pagesize(vma) >> 10,
-                   vma_mmu_pagesize(vma) >> 10);
+                   vma_mmu_pagesize(vma) >> 10,
+                   (vma->vm_flags & VM_LOCKED) ?
+                        (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
        if (m->count < m->size)  /* vma is copied successfully */
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
@@ -706,6 +710,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 * skip over unmapped regions.
 */
 #define PAGEMAP_WALK_SIZE       (PMD_SIZE)
+#define PAGEMAP_WALK_MASK       (PMD_MASK)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
 {
@@ -776,7 +781,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                unsigned long end;
                pm.pos = 0;
-                end = start_vaddr + PAGEMAP_WALK_SIZE;
+                end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
                /* overflow ? */
                if (end < start_vaddr || end > end_vaddr)
                        end = end_vaddr;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e6384..b535d3e5d5f 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
        return vsize;
 }
-int task_statm(struct mm_struct *mm, int *shared, int *text,
+unsigned long task_statm(struct mm_struct *mm,
-               int *data, int *resident)
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *p;
-        int size = kobjsize(mm);
+        unsigned long size = kobjsize(mm);
        down_read(&mm->mmap_sem);
        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70b..74802bc5ded 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
        /* Do some basic Verification. */
        if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
                (ehdr.e_type != ET_CORE) ||
-                !vmcore_elf_check_arch(&ehdr) ||
+                !vmcore_elf64_check_arch(&ehdr) ||
                ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
                ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
                ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fcada42f1aa..e63b4171d58 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void qnx4_destroy_inode(struct inode *inode)
+static void qnx4_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
+static void qnx4_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
 static void init_once(void *foo)
 {
        struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efc..a2a622e079f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 void __quota_error(struct super_block *sb, const char *func,
-                  const char *fmt, ...)
+                   const char *fmt, ...)
 {
-        va_list args;
        if (printk_ratelimit()) {
+                va_list args;
+                struct va_format vaf;
                va_start(args, fmt);
-                printk(KERN_ERR "Quota error (device %s): %s: ",
-                       sb->s_id, func);
+                vaf.fmt = fmt;
-                vprintk(fmt, args);
+                vaf.va = &args;
-                printk("\n");
+                printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
+                       sb->s_id, func, &vaf);
                va_end(args);
        }
 }
@@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_resume);
-int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
-                      struct path *path)
+                   struct path *path)
 {
        int error = security_quota_on(path->dentry);
        if (error)
@@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
                                             DQUOT_LIMITS_ENABLED);
        return error;
 }
-EXPORT_SYMBOL(dquot_quota_on_path);
-int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-{
-        struct path path;
-        int error;
-        error = kern_path(name, LOOKUP_FOLLOW, &path);
-        if (!error) {
-                error = dquot_quota_on_path(sb, type, format_id, &path);
-                path_put(&path);
-        }
-        return error;
-}
 EXPORT_SYMBOL(dquot_quota_on);
 /*
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1ed..b34bdb25490 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
 }
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
-                         void __user *addr)
+                         struct path *path)
 {
-        char *pathname;
+        if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
-        int ret = -ENOSYS;
+                return -ENOSYS;
+        if (sb->s_qcop->quota_on_meta)
-        pathname = getname(addr);
+                return sb->s_qcop->quota_on_meta(sb, type, id);
-        if (IS_ERR(pathname))
+        if (IS_ERR(path))
-                return PTR_ERR(pathname);
+                return PTR_ERR(path);
-        if (sb->s_qcop->quota_on)
+        return sb->s_qcop->quota_on(sb, type, id, path);
-                ret = sb->s_qcop->quota_on(sb, type, id, pathname);
-        putname(pathname);
-        return ret;
 }
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 /* Copy parameters and call proper function */
 static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
-                       void __user *addr)
+                       void __user *addr, struct path *path)
 {
        int ret;
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
        switch (cmd) {
        case Q_QUOTAON:
-                return quota_quotaon(sb, type, cmd, id, addr);
+                return quota_quotaon(sb, type, cmd, id, path);
        case Q_QUOTAOFF:
                if (!sb->s_qcop->quota_off)
                        return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 {
        uint cmds, type;
        struct super_block *sb = NULL;
+        struct path path, *pathp = NULL;
        int ret;
        cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
                return -ENODEV;
        }
+        /*
+         * Path for quotaon has to be resolved before grabbing superblock
+         * because that gets s_umount sem which is also possibly needed by path
+         * resolution (think about autofs) and thus deadlocks could arise.
+         */
+        if (cmds == Q_QUOTAON) {
+                ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+                if (ret)
+                        pathp = ERR_PTR(ret);
+                else
+                        pathp = &path;
+        }
        sb = quotactl_block(special);
        if (IS_ERR(sb))
                return PTR_ERR(sb);
-        ret = do_quotactl(sb, type, cmds, id, addr);
+        ret = do_quotactl(sb, type, cmds, id, addr, pathp);
        drop_super(sb);
+        if (pathp && !IS_ERR(pathp))
+                path_put(pathp);
        return ret;
 }
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabc..e41c1becf09 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                return -ENOMEM;
        ret = read_blk(info, *blk, buf);
        if (ret < 0) {
-                quota_error(dquot->dq_sb, "Can't read quota data "
+                quota_error(dquot->dq_sb, "Can't read quota data block %u",
-                            "block %u", blk);
+                            *blk);
                goto out_buf;
        }
        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                } else {
                        ret = write_blk(info, *blk, buf);
                        if (ret < 0)
-                                quota_error(dquot->dq_sb, "Can't write quota "
+                                quota_error(dquot->dq_sb,
-                                            "tree block %u", blk);
+                                            "Can't write quota tree block %u",
+                                            *blk);
                }
        }
 out_buf:
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c..5520f8ad550 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
-#include <linux/smp_lock.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/module.h>
@@ -31,18 +30,9 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
-static int
+static inline int unsigned_offsets(struct file *file)
-__negative_fpos_check(struct file *file, loff_t pos, size_t count)
 {
-        /*
+        return file->f_mode & FMODE_UNSIGNED_OFFSET;
-         * pos or pos+count is negative here, check overflow.
-         * too big "count" will be caught in rw_verify_area().
-         */
-        if ((pos < 0) && (pos + count < pos))
-                return -EOVERFLOW;
-        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
-                return 0;
-        return -EINVAL;
 }
 /**
@@ -76,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
                break;
        }
-        if (offset < 0 && __negative_fpos_check(file, offset, 0))
+        if (offset < 0 && !unsigned_offsets(file))
                return -EINVAL;
        if (offset > inode->i_sb->s_maxbytes)
                return -EINVAL;
@@ -153,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                        offset += file->f_pos;
        }
        retval = -EINVAL;
-        if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
+        if (offset >= 0 || unsigned_offsets(file)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
@@ -253,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        if (unlikely((ssize_t) count < 0))
                return retval;
        pos = *ppos;
-        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
+        if (unlikely(pos < 0)) {
-                retval = __negative_fpos_check(file, pos, count);
+                if (!unsigned_offsets(file))
-                if (retval)
+                        return retval;
+                if (count >= -pos) /* both values are in 0..LLONG_MAX */
+                        return -EOVERFLOW;
+        } else if (unlikely((loff_t) (pos + count) < 0)) {
+                if (!unsigned_offsets(file))
                        return retval;
        }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5..0bae036831e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485ce..79265fdc317 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/time.h>
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/compat.h>
 /*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
                return 0;
        }
-        /* we need to make sure nobody is changing the file size beneath
-         ** us
-         */
-        reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
        depth = reiserfs_write_lock_once(inode->i_sb);
+        /* we need to make sure nobody is changing the file size beneath us */
+        reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
        write_from = inode->i_size & (blocksize - 1);
        /* if we are on a block boundary, we are already unpacked.  */
        if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b19468..3eea859e699 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
@@ -2552,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
        result = 0;
        if (journal->j_dev_bd != NULL) {
-                if (journal->j_dev_bd->bd_dev != super->s_dev)
-                        bd_release(journal->j_dev_bd);
                result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
                journal->j_dev_bd = NULL;
        }
@@ -2571,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
 {
        int result;
        dev_t jdev;
-        fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+        fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
        char b[BDEVNAME_SIZE];
        result = 0;
@@ -2585,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
        /* there is no "jdev" option and journal is on separate device */
        if ((!jdev_name || !jdev_name[0])) {
-                journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+                if (jdev == super->s_dev)
+                        blkdev_mode &= ~FMODE_EXCL;
+                journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+                                                      journal);
                journal->j_dev_mode = blkdev_mode;
                if (IS_ERR(journal->j_dev_bd)) {
                        result = PTR_ERR(journal->j_dev_bd);
@@ -2594,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
                                         "cannot init journal device '%s': %i",
                                         __bdevname(jdev, b), result);
                        return result;
-                } else if (jdev != super->s_dev) {
+                } else if (jdev != super->s_dev)
-                        result = bd_claim(journal->j_dev_bd, journal);
-                        if (result) {
-                                blkdev_put(journal->j_dev_bd, blkdev_mode);
-                                return result;
-                        }
                        set_blocksize(journal->j_dev_bd, super->s_blocksize);
-                }
                return 0;
        }
        journal->j_dev_mode = blkdev_mode;
-        journal->j_dev_bd = open_bdev_exclusive(jdev_name,
+        journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
-                                                blkdev_mode, journal);
        if (IS_ERR(journal->j_dev_bd)) {
                result = PTR_ERR(journal->j_dev_bd);
                journal->j_dev_bd = NULL;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f53851..45de98b5946 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...)	//int print_mode, int first, int l
        va_list args;
        int mode, first, last;
-        va_start(args, bh);
        if (!bh) {
                printk("print_block: buffer is NULL\n");
                return;
        }
+        va_start(args, bh);
        mode = va_arg(args, int);
        first = va_arg(args, int);
        last = va_arg(args, int);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4..0aab04f4682 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 struct file_system_type reiserfs_fs_type;
@@ -530,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void reiserfs_destroy_inode(struct inode *inode)
+static void reiserfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
@@ -626,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
 static const struct dquot_operations reiserfs_quota_operations = {
        .write_dquot = reiserfs_write_dquot,
@@ -2042,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 * Standard function to be called on quota_on
 */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-                             char *name)
+                             struct path *path)
 {
        int err;
-        struct path path;
        struct inode *inode;
        struct reiserfs_transaction_handle th;
        if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
                return -EINVAL;
-        err = kern_path(name, LOOKUP_FOLLOW, &path);
-        if (err)
-                return err;
        /* Quotafile not on the same filesystem? */
-        if (path.mnt->mnt_sb != sb) {
+        if (path->mnt->mnt_sb != sb) {
                err = -EXDEV;
                goto out;
        }
-        inode = path.dentry->d_inode;
+        inode = path->dentry->d_inode;
        /* We must not pack tails for quota files on reiserfs for quota IO to work */
        if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
                err = reiserfs_unpack(inode, NULL);
@@ -2076,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
        /* Journaling quota? */
        if (REISERFS_SB(sb)->s_qf_names[type]) {
                /* Quotafile not of fs root? */
-                if (path.dentry->d_parent != sb->s_root)
+                if (path->dentry->d_parent != sb->s_root)
                        reiserfs_warning(sb, "super-6521",
                                 "Quota file not on filesystem root. "
                                 "Journalled quota will not work.");
@@ -2095,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
                if (err)
                        goto out;
        }
-        err = dquot_quota_on_path(sb, type, format_id, &path);
+        err = dquot_quota_on(sb, type, format_id, path);
 out:
-        path_put(&path);
        return err;
 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5d04a7828e7..3cfb2e93364 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -870,11 +870,14 @@ out:
        return err;
 }
-static int reiserfs_check_acl(struct inode *inode, int mask)
+static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
        struct posix_acl *acl;
        int error = -EAGAIN; /* do regular unix permission checks by default */
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (acl) {
@@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
        return 0;
 }
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
        /*
         * We don't do permission checks on the internal objects.
         * Permissions are determined by the "owning" object.
@@ -965,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
         * Stat data v1 doesn't support ACLs.
         */
        if (get_inode_sd_version(inode) != STAT_DATA_V1)
-                return generic_permission(inode, mask, reiserfs_check_acl);
+                return generic_permission(inode, mask, flags,
+                                        reiserfs_check_acl);
 #endif
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
        return -EPERM;
 }
@@ -990,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
                REISERFS_SB(s)->priv_root = dentry;
-                dentry->d_op = &xattr_lookup_poison_ops;
+                d_set_d_op(dentry, &xattr_lookup_poison_ops);
                if (dentry->d_inode)
                        dentry->d_inode->i_flags |= S_PRIVATE;
        } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a2..90d2fcb67a3 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
                struct reiserfs_transaction_handle th;
                size_t size = reiserfs_xattr_nblocks(inode,
                                             reiserfs_acl_size(clone->a_count));
-                reiserfs_write_lock(inode->i_sb);
+                int depth;
+                depth = reiserfs_write_lock_once(inode->i_sb);
                error = journal_begin(&th, inode->i_sb, size * 2);
                if (!error) {
                        int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
                        if (error2)
                                error = error2;
                }
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, depth);
        }
        posix_acl_release(clone);
        return error;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6647f90e55c..2305e3121cb 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
 /*
 * return a spent inode to the slab cache
 */
-static void romfs_destroy_inode(struct inode *inode)
+static void romfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
+static void romfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, romfs_i_callback);
+}
 /*
 * get filesystem statistics
 */
diff --git a/fs/select.c b/fs/select.c
index b7b10aa3086..e56560d2b08 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
                rts.tv_sec = rts.tv_nsec = 0;
        if (timeval) {
+                if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+                        memset(&rtv, 0, sizeof(rtv));
                rtv.tv_sec = rts.tv_sec;
                rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f..50a5d978da1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 {
        struct file *file = sd->u.file;
        loff_t pos = sd->pos;
-        int ret, more;
+        int more;
-        ret = buf->ops->confirm(pipe, buf);
-        if (!ret) {
-                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-                if (file->f_op && file->f_op->sendpage)
-                        ret = file->f_op->sendpage(file, buf->page, buf->offset,
-                                                   sd->len, &pos, more);
-                else
-                        ret = -EINVAL;
-        }
-        return ret;
+        if (!likely(file->f_op && file->f_op->sendpage))
+                return -EINVAL;
+        more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+        return file->f_op->sendpage(file, buf->page, buf->offset,
+                                    sd->len, &pos, more);
 }
 /*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        void *fsdata;
        int ret;
-        /*
-         * make sure the data in this buffer is uptodate
-         */
-        ret = buf->ops->confirm(pipe, buf);
-        if (unlikely(ret))
-                return ret;
        offset = sd->pos & ~PAGE_CACHE_MASK;
        this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                if (sd->len > sd->total_len)
                        sd->len = sd->total_len;
-                ret = actor(pipe, buf, sd);
+                ret = buf->ops->confirm(pipe, buf);
-                if (ret <= 0) {
+                if (unlikely(ret)) {
                        if (ret == -ENODATA)
                                ret = 0;
                        return ret;
                }
+                ret = actor(pipe, buf, sd);
+                if (ret <= 0)
+                        return ret;
                buf->offset += ret;
                buf->len -= ret;
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        int ret;
        void *data;
-        ret = buf->ops->confirm(pipe, buf);
-        if (ret)
-                return ret;
        data = buf->ops->map(pipe, buf, 0);
        ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
        buf->ops->unmap(pipe, buf, data);
@@ -1311,18 +1300,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags);
-/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
-        if (S_ISFIFO(inode->i_mode))
-                return inode->i_pipe;
-        return NULL;
-}
 /*
 * Determine where to splice to/from.
@@ -1336,8 +1313,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
        loff_t offset, *off;
        long ret;
-        ipipe = pipe_info(in->f_path.dentry->d_inode);
+        ipipe = get_pipe_info(in);
-        opipe = pipe_info(out->f_path.dentry->d_inode);
+        opipe = get_pipe_info(out);
        if (ipipe && opipe) {
                if (off_in || off_out)
@@ -1507,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        char *src;
        int ret;
-        ret = buf->ops->confirm(pipe, buf);
-        if (unlikely(ret))
-                return ret;
        /*
         * See if we can use the atomic maps, by prefaulting in the
         * pages and doing an atomic copy
@@ -1555,7 +1528,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
        int error;
        long ret;
-        pipe = pipe_info(file->f_path.dentry->d_inode);
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
@@ -1642,7 +1615,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
        };
        long ret;
-        pipe = pipe_info(file->f_path.dentry->d_inode);
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
@@ -2022,8 +1995,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 static long do_tee(struct file *in, struct file *out, size_t len,
                   unsigned int flags)
 {
-        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
+        struct pipe_inode_info *ipipe = get_pipe_info(in);
-        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
+        struct pipe_inode_info *opipe = get_pipe_info(out);
        int ret = -EINVAL;
        /*
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d0..aa68a8a3151 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
 config SQUASHFS_XATTR
        bool "Squashfs XATTR support"
        depends on SQUASHFS
-        default n
        help
          Saying Y here includes support for extended attributes (xattrs).
          Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
 config SQUASHFS_LZO
        bool "Include support for LZO compressed file systems"
        depends on SQUASHFS
-        default n
        select LZO_DECOMPRESS
        help
          Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
          If unsure, say N.
+config SQUASHFS_XZ
+        bool "Include support for XZ compressed file systems"
+        depends on SQUASHFS
+        select XZ_DEC
+        help
+          Saying Y here includes support for reading Squashfs file systems
+          compressed with XZ compression.  XZ gives better compression than
+          the default zlib compression, at the expense of greater CPU and
+          memory overhead.
+          XZ is not the standard compression used in Squashfs and so most
+          file systems will be readable without selecting this option.
+          If unsure, say N.
 config SQUASHFS_EMBEDDED
        bool "Additional option for memory-constrained systems"
        depends on SQUASHFS
-        default n
        help
          Saying Y here allows you to specify cache size.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d32..cecf2bea07a 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb84..8ab48bc2fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
@@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
                *length = (unsigned char) bh->b_data[*offset] |
                        (unsigned char) bh->b_data[*offset + 1] << 8;
                *offset += 2;
+                if (*offset == msblk->devblksize) {
+                        put_bh(bh);
+                        bh = sb_bread(sb, ++(*cur_index));
+                        if (bh == NULL)
+                                return NULL;
+                        *offset = 0;
+                }
        }
        return bh;
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee905..26b15ae34d6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 /*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722..a5940e54c4d 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "decompressor.h"
 #include "squashfs.h"
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
 };
 #ifndef CONFIG_SQUASHFS_LZO
-static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
        NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
 #endif
+#ifndef CONFIG_SQUASHFS_XZ
+static const struct squashfs_decompressor squashfs_xz_comp_ops = {
+        NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+};
+#endif
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
        NULL, NULL, NULL, 0, "unknown", 0
 };
 static const struct squashfs_decompressor *decompressor[] = {
        &squashfs_zlib_comp_ops,
-        &squashfs_lzma_unsupported_comp_ops,
-#ifdef CONFIG_SQUASHFS_LZO
        &squashfs_lzo_comp_ops,
-#else
+        &squashfs_xz_comp_ops,
-        &squashfs_lzo_unsupported_comp_ops,
+        &squashfs_lzma_unsupported_comp_ops,
-#endif
        &squashfs_unknown_comp_ops
 };
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f..3b305a70f7a 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
        return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
                length, srclength, pages);
 }
+#ifdef CONFIG_SQUASHFS_XZ
+extern const struct squashfs_decompressor squashfs_xz_comp_ops;
+#endif
+#ifdef CONFIG_SQUASHFS_LZO
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
+#endif
 #endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879..7eef571443c 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 /*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b7..d8f32452638 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 /*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c..7da759e34c5 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f7..ba729d80887 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
 #define WARNING(s, args...)     pr_warning("SQUASHFS: "s, ## args)
-static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
-{
-        return list_entry(inode, struct squashfs_inode_info, vfs_inode);
-}
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
                                int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
-/* lzo_wrapper.c */
-extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab1..39533feffd6 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
 #define ZLIB_COMPRESSION        1
 #define LZMA_COMPRESSION        2
 #define LZO_COMPRESSION         3
+#define XZ_COMPRESSION          4
 struct squashfs_super_block {
        __le32                  s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a..359baefc01f 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
        };
        struct inode    vfs_inode;
 };
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+        return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 24de30ba34c..20700b9f2b4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
 }
-static void squashfs_destroy_inode(struct inode *inode)
+static void squashfs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
 }
+static void squashfs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, squashfs_i_callback);
+}
 static struct file_system_type squashfs_fs_type = {
        .owner = THIS_MODULE,
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c3..05385dbe146 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 00000000000..c4eb4001825
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,147 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xz_wrapper.c
+ */
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/xz.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+struct squashfs_xz {
+        struct xz_dec *state;
+        struct xz_buf buf;
+};
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
+{
+        int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+        struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+        if (stream == NULL)
+                goto failed;
+        stream->state = xz_dec_init(XZ_PREALLOC, block_size);
+        if (stream->state == NULL)
+                goto failed;
+        return stream;
+failed:
+        ERROR("Failed to allocate xz workspace\n");
+        kfree(stream);
+        return NULL;
+}
+static void squashfs_xz_free(void *strm)
+{
+        struct squashfs_xz *stream = strm;
+        if (stream) {
+                xz_dec_end(stream->state);
+                kfree(stream);
+        }
+}
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+        struct buffer_head **bh, int b, int offset, int length, int srclength,
+        int pages)
+{
+        enum xz_ret xz_err;
+        int avail, total = 0, k = 0, page = 0;
+        struct squashfs_xz *stream = msblk->stream;
+        mutex_lock(&msblk->read_data_mutex);
+        xz_dec_reset(stream->state);
+        stream->buf.in_pos = 0;
+        stream->buf.in_size = 0;
+        stream->buf.out_pos = 0;
+        stream->buf.out_size = PAGE_CACHE_SIZE;
+        stream->buf.out = buffer[page++];
+        do {
+                if (stream->buf.in_pos == stream->buf.in_size && k < b) {
+                        avail = min(length, msblk->devblksize - offset);
+                        length -= avail;
+                        wait_on_buffer(bh[k]);
+                        if (!buffer_uptodate(bh[k]))
+                                goto release_mutex;
+                        stream->buf.in = bh[k]->b_data + offset;
+                        stream->buf.in_size = avail;
+                        stream->buf.in_pos = 0;
+                        offset = 0;
+                }
+                if (stream->buf.out_pos == stream->buf.out_size
+                                                        && page < pages) {
+                        stream->buf.out = buffer[page++];
+                        stream->buf.out_pos = 0;
+                        total += PAGE_CACHE_SIZE;
+                }
+                xz_err = xz_dec_run(stream->state, &stream->buf);
+                if (stream->buf.in_pos == stream->buf.in_size && k < b)
+                        put_bh(bh[k++]);
+        } while (xz_err == XZ_OK);
+        if (xz_err != XZ_STREAM_END) {
+                ERROR("xz_dec_run error, data probably corrupt\n");
+                goto release_mutex;
+        }
+        if (k < b) {
+                ERROR("xz_uncompress error, input remaining\n");
+                goto release_mutex;
+        }
+        total += stream->buf.out_pos;
+        mutex_unlock(&msblk->read_data_mutex);
+        return total;
+release_mutex:
+        mutex_unlock(&msblk->read_data_mutex);
+        for (; k < b; k++)
+                put_bh(bh[k]);
+        return -EIO;
+}
+const struct squashfs_decompressor squashfs_xz_comp_ops = {
+        .init = squashfs_xz_init,
+        .free = squashfs_xz_free,
+        .decompress = squashfs_xz_uncompress,
+        .id = XZ_COMPRESSION,
+        .name = "xz",
+        .supported = 1
+};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e48..4661ae2b1ce 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
        struct buffer_head **bh, int b, int offset, int length, int srclength,
        int pages)
 {
-        int zlib_err = 0, zlib_init = 0;
+        int zlib_err, zlib_init = 0;
-        int avail, bytes, k = 0, page = 0;
+        int k = 0, page = 0;
        z_stream *stream = msblk->stream;
        mutex_lock(&msblk->read_data_mutex);
@@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
        stream->avail_out = 0;
        stream->avail_in = 0;
-        bytes = length;
        do {
                if (stream->avail_in == 0 && k < b) {
-                        avail = min(bytes, msblk->devblksize - offset);
+                        int avail = min(length, msblk->devblksize - offset);
-                        bytes -= avail;
+                        length -= avail;
                        wait_on_buffer(bh[k]);
                        if (!buffer_uptodate(bh[k]))
                                goto release_mutex;
-                        if (avail == 0) {
-                                offset = 0;
-                                put_bh(bh[k++]);
-                                continue;
-                        }
                        stream->next_in = bh[k]->b_data + offset;
                        stream->avail_in = avail;
                        offset = 0;
@@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
                goto release_mutex;
        }
+        if (k < b) {
+                ERROR("zlib_uncompress error, data remaining\n");
+                goto release_mutex;
+        }
        length = stream->total_out;
        mutex_unlock(&msblk->read_data_mutex);
        return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e21390..d5c61cf2b70 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
        int error = -EINVAL;
        int lookup_flags = 0;
-        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
                goto out;
        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
+        if (flag & AT_NO_AUTOMOUNT)
+                lookup_flags |= LOOKUP_NO_AUTOMOUNT;
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9..74e149efed8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
                INIT_LIST_HEAD(&s->s_files);
 #endif
                INIT_LIST_HEAD(&s->s_instances);
-                INIT_HLIST_HEAD(&s->s_anon);
+                INIT_HLIST_BL_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
                INIT_LIST_HEAD(&s->s_dentry_lru);
                init_rwsem(&s->s_umount);
@@ -766,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 {
        struct block_device *bdev;
        struct super_block *s;
-        fmode_t mode = FMODE_READ;
+        fmode_t mode = FMODE_READ | FMODE_EXCL;
        int error = 0;
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
-        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+        bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
@@ -801,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
                /*
                 * s_umount nests inside bd_mutex during
-                 * __invalidate_device().  close_bdev_exclusive()
+                 * __invalidate_device().  blkdev_put() acquires
-                 * acquires bd_mutex and can't be called under
+                 * bd_mutex and can't be called under s_umount.  Drop
-                 * s_umount.  Drop s_umount temporarily.  This is safe
+                 * s_umount temporarily.  This is safe as we're
-                 * as we're holding an active reference.
+                 * holding an active reference.
                 */
                up_write(&s->s_umount);
-                close_bdev_exclusive(bdev, mode);
+                blkdev_put(bdev, mode);
                down_write(&s->s_umount);
        } else {
                char b[BDEVNAME_SIZE];
@@ -831,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 error_s:
        error = PTR_ERR(s);
 error_bdev:
-        close_bdev_exclusive(bdev, mode);
+        blkdev_put(bdev, mode);
 error:
        return ERR_PTR(error);
 }
@@ -862,7 +863,8 @@ void kill_block_super(struct super_block *sb)
        bdev->bd_super = NULL;
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
-        close_bdev_exclusive(bdev, mode);
+        WARN_ON_ONCE(!(mode & FMODE_EXCL));
+        blkdev_put(bdev, mode | FMODE_EXCL);
 }
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d..8c41feacbac 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
 config SYSFS
-        bool "sysfs file system support" if EMBEDDED
+        bool "sysfs file system support" if EXPERT
        default y
        help
        The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b..ea9120a830d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
                goto repeat;
 }
-static int sysfs_dentry_delete(struct dentry *dentry)
+static int sysfs_dentry_delete(const struct dentry *dentry)
 {
        struct sysfs_dirent *sd = dentry->d_fsdata;
        return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
 static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct sysfs_dirent *sd = dentry->d_fsdata;
+        struct sysfs_dirent *sd;
        int is_dir;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        sd = dentry->d_fsdata;
        mutex_lock(&sysfs_mutex);
        /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
        /* instantiate and hash dentry */
        ret = d_find_alias(inode);
        if (!ret) {
-                dentry->d_op = &sysfs_dentry_ops;
+                d_set_d_op(dentry, &sysfs_dentry_ops);
                dentry->d_fsdata = sysfs_get(sd);
                d_add(dentry, inode);
        } else {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af..c8769dc222d 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
        struct attribute *const *attr;
        int i;
-        if (grp)
+        dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-                dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-        else
-                dir_sd = sysfs_get(kobj->sd);
        if (!dir_sd)
                return -ENOENT;
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
        struct sysfs_dirent *dir_sd;
        struct attribute *const *attr;
-        if (grp)
+        dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-                dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
-        else
-                dir_sd = sysfs_get(kobj->sd);
        if (dir_sd) {
                for (attr = grp->attrs; *attr; ++attr)
                        sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba3..0a12eb89cd3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/sysfs.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
@@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
                return -ENOENT;
 }
-int sysfs_permission(struct inode *inode, int mask)
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        struct sysfs_dirent *sd = inode->i_private;
+        struct sysfs_dirent *sd;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        sd = inode->i_private;
        mutex_lock(&sysfs_mutex);
        sysfs_refresh_inode(sd, inode);
        mutex_unlock(&sysfs_mutex);
-        return generic_permission(inode, mask, NULL);
+        return generic_permission(inode, mask, flags, NULL);
 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e95..3d28af31d86 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
 */
 #include <linux/lockdep.h>
+#include <linux/kobject_ns.h>
 #include <linux/fs.h>
 struct sysfs_open_dirent;
@@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
 void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e..0630eb969a2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
        return &si->vfs_inode;
 }
-static void sysv_destroy_inode(struct inode *inode)
+static void sysv_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
+static void sysv_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, sysv_i_callback);
+}
 static void init_once(void *p)
 {
        struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e7f7d11cd..b427b1208c2 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
        return err;
 }
-static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
+                struct qstr *qstr)
 {
        /* Truncate the name in place, avoids having to define a compare
           function. */
@@ -47,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
        struct inode * inode = NULL;
        ino_t ino;
-        dentry->d_op = dir->i_sb->s_root->d_op;
        if (dentry->d_name.len > SYSV_NAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
        ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3d9c62be0c1..f60c196913e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
        sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
        /* set up enough so that it can read an inode */
        sb->s_op = &sysv_sops;
+        if (sbi->s_forced_ro)
+                sb->s_flags |= MS_RDONLY;
+        if (sbi->s_truncate)
+                sb->s_d_op = &sysv_dentry_operations;
        root_inode = sysv_iget(sb, SYSV_ROOT_INO);
        if (IS_ERR(root_inode)) {
                printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
                printk("SysV FS: get root dentry failed\n");
                return 0;
        }
-        if (sbi->s_forced_ro)
-                sb->s_flags |= MS_RDONLY;
-        if (sbi->s_truncate)
-                sb->s_root->d_op = &sysv_dentry_operations;
        return 1;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e..6e11c2975dc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
        return &ui->vfs_inode;
 };
+static void ubifs_i_callback(struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct ubifs_inode *ui = ubifs_inode(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_cache_free(ubifs_inode_slab, ui);
+}
 static void ubifs_destroy_inode(struct inode *inode)
 {
        struct ubifs_inode *ui = ubifs_inode(inode);
        kfree(ui->data);
-        kmem_cache_free(ubifs_inode_slab, inode);
+        call_rcu(&inode->i_rcu, ubifs_i_callback);
 }
 /*
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4..0e0e99bd6bc 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
 config UDF_FS
        tristate "UDF file system support"
-        depends on BKL # needs serious work to remove
        select CRC_ITU_T
        help
          This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4ce..306ee39ef2c 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
                                udf_debug("bit %ld already set\n", bit + i);
                                udf_debug("byte=%2x\n",
                                        ((char *)bh->b_data)[(bit + i) >> 3]);
-                        } else {
-                                udf_add_free_space(sb, sbi->s_partition, 1);
                        }
                }
+                udf_add_free_space(sb, sbi->s_partition, count);
                mark_buffer_dirty(bh);
                if (overflow) {
                        block += count;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf5022..eb8bfe2b89a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode *dir = filp->f_path.dentry->d_inode;
        int result;
-        lock_kernel();
        if (filp->f_pos == 0) {
                if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
-                        unlock_kernel();
                        return 0;
                }
                filp->f_pos++;
        }
        result = do_udf_readdir(dir, filp, filldir, dirent);
-        unlock_kernel();
        return result;
 }
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c..89c78486cbb 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
 #include <linux/string.h> /* memset */
 #include <linux/capability.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        size_t count = iocb->ki_left;
        struct udf_inode_info *iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                if (file->f_flags & O_APPEND)
                        pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                        udf_expand_file_adinicb(inode, pos + count, &err);
                        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                                udf_debug("udf_expand_adinicb: err=%d\n", err);
+                                up_write(&iinfo->i_data_sem);
                                return err;
                        }
                } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                iinfo->i_lenAlloc = inode->i_size;
                }
        }
+        up_write(&iinfo->i_data_sem);
        retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
        if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        long old_block, new_block;
        int result = -EINVAL;
-        lock_kernel();
        if (file_permission(filp, MAY_READ) != 0) {
                udf_debug("no permission to access inode %lu\n", inode->i_ino);
                result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        }
 out:
-        unlock_kernel();
        return result;
 }
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 {
        if (filp->f_mode & FMODE_WRITE) {
                mutex_lock(&inode->i_mutex);
-                lock_kernel();
+                down_write(&UDF_I(inode)->i_data_sem);
                udf_discard_prealloc(inode);
                udf_truncate_tail_extent(inode);
-                unlock_kernel();
+                up_write(&UDF_I(inode)->i_data_sem);
                mutex_unlock(&inode->i_mutex);
        }
        return 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc..6fb7e0adcda 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
                return NULL;
        }
-        mutex_lock(&sbi->s_alloc_mutex);
        if (sbi->s_lvid_bh) {
-                struct logicalVolIntegrityDesc *lvid =
+                struct logicalVolIntegrityDescImpUse *lvidiu;
-                        (struct logicalVolIntegrityDesc *)
-                        sbi->s_lvid_bh->b_data;
+                iinfo->i_unique = lvid_get_unique_id(sb);
-                struct logicalVolIntegrityDescImpUse *lvidiu =
+                mutex_lock(&sbi->s_alloc_mutex);
-                                                        udf_sb_lvidiu(sbi);
+                lvidiu = udf_sb_lvidiu(sbi);
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                (lvid->logicalVolContentsUse);
                if (S_ISDIR(mode))
                        le32_add_cpu(&lvidiu->numDirs, 1);
                else
                        le32_add_cpu(&lvidiu->numFiles, 1);
-                iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
                udf_updated_lvid(sb);
+                mutex_unlock(&sbi->s_alloc_mutex);
        }
-        mutex_unlock(&sbi->s_alloc_mutex);
        inode_init_owner(inode, dir, mode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2d..c6a2e782b97 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
 #include "udfdecl.h"
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
 static mode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
 static void udf_fill_inode(struct inode *, struct buffer_head *);
+static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
                                        sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
                want_delete = 1;
                inode->i_size = 0;
                udf_truncate(inode);
-                lock_kernel();
                udf_update_inode(inode, IS_SYNC(inode));
-                unlock_kernel();
        }
        invalidate_inode_buffers(inode);
        end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
        kfree(iinfo->i_ext.i_data);
        iinfo->i_ext.i_data = NULL;
        if (want_delete) {
-                lock_kernel();
                udf_free_inode(inode);
-                unlock_kernel();
        }
 }
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
        err = -EIO;
        new = 0;
        bh = NULL;
-        lock_kernel();
        iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        if (block == iinfo->i_next_alloc_block + 1) {
                iinfo->i_next_alloc_block++;
                iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
        map_bh(bh_result, inode->i_sb, phys);
 abort:
-        unlock_kernel();
+        up_write(&iinfo->i_data_sem);
        return err;
 }
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        lock_kernel();
        iinfo = UDF_I(inode);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+                down_write(&iinfo->i_data_sem);
                if (inode->i_sb->s_blocksize <
                                (udf_file_entry_alloc_offset(inode) +
                                 inode->i_size)) {
                        udf_expand_file_adinicb(inode, inode->i_size, &err);
                        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                                inode->i_size = iinfo->i_lenAlloc;
-                                unlock_kernel();
+                                up_write(&iinfo->i_data_sem);
                                return;
                        } else
                                udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
                                offset - udf_file_entry_alloc_offset(inode));
                        iinfo->i_lenAlloc = inode->i_size;
                }
+                up_write(&iinfo->i_data_sem);
        } else {
                block_truncate_page(inode->i_mapping, inode->i_size,
                                    udf_get_block);
+                down_write(&iinfo->i_data_sem);
                udf_truncate_extents(inode);
+                up_write(&iinfo->i_data_sem);
        }
        inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
                udf_sync_inode(inode);
        else
                mark_inode_dirty(inode);
-        unlock_kernel();
 }
 static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
                return;
        }
+        read_lock(&sbi->s_cred_lock);
        inode->i_uid = le32_to_cpu(fe->uid);
        if (inode->i_uid == -1 ||
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
                inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
-        inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
-        if (!inode->i_nlink)
-                inode->i_nlink = 1;
-        inode->i_size = le64_to_cpu(fe->informationLength);
-        iinfo->i_lenExtents = inode->i_size;
        if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
                        sbi->s_fmode != UDF_INVALID_MODE)
                inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
        else
                inode->i_mode = udf_convert_permissions(fe);
        inode->i_mode &= ~sbi->s_umask;
+        read_unlock(&sbi->s_cred_lock);
+        inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
+        if (!inode->i_nlink)
+                inode->i_nlink = 1;
+        inode->i_size = le64_to_cpu(fe->informationLength);
+        iinfo->i_lenExtents = inode->i_size;
        if (iinfo->i_efe == 0) {
                inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
 int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-        int ret;
+        return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-        lock_kernel();
-        ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-        unlock_kernel();
-        return ret;
 }
-int udf_sync_inode(struct inode *inode)
+static int udf_sync_inode(struct inode *inode)
 {
        return udf_update_inode(inode, 1);
 }
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
        struct extent_position epos = {};
        int ret;
-        lock_kernel();
+        down_read(&UDF_I(inode)->i_data_sem);
        if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
                                                (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
        else
                ret = 0;
-        unlock_kernel();
+        up_read(&UDF_I(inode)->i_data_sem);
        brelse(epos.bh);
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baeb..2be0f9eb86d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
                }
                if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
-                    isdotdot) {
+                    isdotdot)
-                        brelse(epos.bh);
+                        goto out_ok;
-                        return fi;
-                }
                if (!lfi)
                        continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
        if (dentry->d_name.len > UDF_NAME_LEN - 2)
                return ERR_PTR(-ENAMETOOLONG);
-        lock_kernel();
 #ifdef UDF_RECOVERY
        /* temporary shorthand for specifying files by inode number */
        if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
                };
                inode = udf_iget(dir->i_sb, lb);
                if (!inode) {
-                        unlock_kernel();
                        return ERR_PTR(-EACCES);
                }
        } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
                loc = lelb_to_cpu(cfi.icb.extLocation);
                inode = udf_iget(dir->i_sb, &loc);
                if (!inode) {
-                        unlock_kernel();
                        return ERR_PTR(-EACCES);
                }
        }
-        unlock_kernel();
        return d_splice_alias(inode, dentry);
 }
@@ -476,15 +469,19 @@ add:
                                f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
                if (!fibh->ebh)
                        goto out_err;
+                /* Extents could have been merged, invalidate our position */
+                brelse(epos.bh);
+                epos.bh = NULL;
+                epos.block = dinfo->i_location;
+                epos.offset = udf_file_entry_alloc_offset(dir);
                if (!fibh->soffset) {
-                        if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
+                        /* Find the freshly allocated block */
-                            (EXT_RECORDED_ALLOCATED >> 30)) {
+                        while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
-                                block = eloc.logicalBlockNum + ((elen - 1) >>
+                                (EXT_RECORDED_ALLOCATED >> 30))
+                                ;
+                        block = eloc.logicalBlockNum + ((elen - 1) >>
                                        dir->i_sb->s_blocksize_bits);
-                        } else
-                                block++;
                        brelse(fibh->sbh);
                        fibh->sbh = fibh->ebh;
                        fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
        int err;
        struct udf_inode_info *iinfo;
-        lock_kernel();
        inode = udf_new_inode(dir, mode, &err);
        if (!inode) {
-                unlock_kernel();
                return err;
        }
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
                inode->i_nlink--;
                mark_inode_dirty(inode);
                iput(inode);
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
        if (fibh.sbh != fibh.ebh)
                brelse(fibh.ebh);
        brelse(fibh.sbh);
-        unlock_kernel();
        d_instantiate(dentry, inode);
        return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
        if (!old_valid_dev(rdev))
                return -EINVAL;
-        lock_kernel();
        err = -EIO;
        inode = udf_new_inode(dir, mode, &err);
        if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
                inode->i_nlink--;
                mark_inode_dirty(inode);
                iput(inode);
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
        err = 0;
 out:
-        unlock_kernel();
        return err;
 }
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct udf_inode_info *dinfo = UDF_I(dir);
        struct udf_inode_info *iinfo;
-        lock_kernel();
        err = -EMLINK;
        if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
                goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        err = 0;
 out:
-        unlock_kernel();
        return err;
 }
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
        struct kernel_lb_addr tloc;
        retval = -ENOENT;
-        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
        if (!fi)
                goto out;
@@ -826,7 +813,6 @@ end_rmdir:
        brelse(fibh.sbh);
 out:
-        unlock_kernel();
        return retval;
 }
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
        struct kernel_lb_addr tloc;
        retval = -ENOENT;
-        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
        if (!fi)
                goto out;
@@ -870,7 +855,6 @@ end_unlink:
        brelse(fibh.sbh);
 out:
-        unlock_kernel();
        return retval;
 }
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        int block;
        unsigned char *name = NULL;
        int namelen;
-        struct buffer_head *bh;
        struct udf_inode_info *iinfo;
+        struct super_block *sb = dir->i_sb;
-        lock_kernel();
        inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
        if (!inode)
                goto out;
+        iinfo = UDF_I(inode);
+        down_write(&iinfo->i_data_sem);
        name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
        if (!name) {
                err = -ENOMEM;
                goto out_no_entry;
        }
-        iinfo = UDF_I(inode);
        inode->i_data.a_ops = &udf_symlink_aops;
        inode->i_op = &udf_symlink_inode_operations;
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                struct kernel_lb_addr eloc;
                uint32_t bsize;
-                block = udf_new_block(inode->i_sb, inode,
+                block = udf_new_block(sb, inode,
                                iinfo->i_location.partitionReferenceNum,
                                iinfo->i_location.logicalBlockNum, &err);
                if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                eloc.logicalBlockNum = block;
                eloc.partitionReferenceNum =
                                iinfo->i_location.partitionReferenceNum;
-                bsize = inode->i_sb->s_blocksize;
+                bsize = sb->s_blocksize;
                iinfo->i_lenExtents = bsize;
                udf_add_aext(inode, &epos, &eloc, bsize, 0);
                brelse(epos.bh);
-                block = udf_get_pblock(inode->i_sb, block,
+                block = udf_get_pblock(sb, block,
                                iinfo->i_location.partitionReferenceNum,
                                0);
-                epos.bh = udf_tgetblk(inode->i_sb, block);
+                epos.bh = udf_tgetblk(sb, block);
                lock_buffer(epos.bh);
-                memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
+                memset(epos.bh->b_data, 0x00, bsize);
                set_buffer_uptodate(epos.bh);
                unlock_buffer(epos.bh);
                mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        } else
                ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
-        eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode);
+        eoffset = sb->s_blocksize - udf_ext0_offset(inode);
        pc = (struct pathComponent *)ea;
        if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                }
                if (pc->componentType == 5) {
-                        namelen = udf_put_filename(inode->i_sb, compstart, name,
+                        namelen = udf_put_filename(sb, compstart, name,
                                                   symname - compstart);
                        if (!namelen)
                                goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
        if (!fi)
                goto out_no_entry;
-        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
+        cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
        cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-        bh = UDF_SB(inode->i_sb)->s_lvid_bh;
+        if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-        if (bh) {
-                struct logicalVolIntegrityDesc *lvid =
-                                (struct logicalVolIntegrityDesc *)bh->b_data;
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                lvid->logicalVolContentsUse;
-                uniqueID = le64_to_cpu(lvhd->uniqueID);
                *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-                        cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL);
+                        cpu_to_le32(lvid_get_unique_id(sb));
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
-                mark_buffer_dirty(bh);
        }
        udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
        if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                mark_inode_dirty(dir);
+        up_write(&iinfo->i_data_sem);
        if (fibh.sbh != fibh.ebh)
                brelse(fibh.ebh);
        brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 out:
        kfree(name);
-        unlock_kernel();
        return err;
 out_no_entry:
+        up_write(&iinfo->i_data_sem);
        inode_dec_link_count(inode);
        iput(inode);
        goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        struct udf_fileident_bh fibh;
        struct fileIdentDesc cfi, *fi;
        int err;
-        struct buffer_head *bh;
-        lock_kernel();
        if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
-                unlock_kernel();
                return -EMLINK;
        }
        fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
        if (!fi) {
-                unlock_kernel();
                return err;
        }
        cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
-        bh = UDF_SB(inode->i_sb)->s_lvid_bh;
+        if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-        if (bh) {
-                struct logicalVolIntegrityDesc *lvid =
-                                (struct logicalVolIntegrityDesc *)bh->b_data;
-                struct logicalVolHeaderDesc *lvhd;
-                uint64_t uniqueID;
-                lvhd = (struct logicalVolHeaderDesc *)
-                                (lvid->logicalVolContentsUse);
-                uniqueID = le64_to_cpu(lvhd->uniqueID);
                *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-                        cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL);
+                        cpu_to_le32(lvid_get_unique_id(inode->i_sb));
-                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
-                        uniqueID += 16;
-                lvhd->uniqueID = cpu_to_le64(uniqueID);
-                mark_buffer_dirty(bh);
        }
        udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
        if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        mark_inode_dirty(inode);
        ihold(inode);
        d_instantiate(dentry, inode);
-        unlock_kernel();
        return 0;
 }
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct kernel_lb_addr tloc;
        struct udf_inode_info *old_iinfo = UDF_I(old_inode);
-        lock_kernel();
        ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
        if (ofi) {
                if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
                        brelse(nfibh.ebh);
                brelse(nfibh.sbh);
        }
-        unlock_kernel();
        return retval;
 }
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
        struct fileIdentDesc cfi;
        struct udf_fileident_bh fibh;
-        lock_kernel();
        if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
                goto out_unlock;
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
        inode = udf_iget(child->d_inode->i_sb, &tloc);
        if (!inode)
                goto out_unlock;
-        unlock_kernel();
        return d_obtain_alias(inode);
 out_unlock:
-        unlock_kernel();
        return ERR_PTR(-EACCES);
 }
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0..a71090ea0e0 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
+#include <linux/mutex.h>
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
                        uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
        struct udf_sb_info *sbi = UDF_SB(sb);
        u16 reallocationTableLen;
        struct buffer_head *bh;
+        int ret = 0;
+        mutex_lock(&sbi->s_alloc_mutex);
        for (i = 0; i < sbi->s_partitions; i++) {
                struct udf_part_map *map = &sbi->s_partmaps[i];
                if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                        break;
                                }
-                        if (!st)
+                        if (!st) {
-                                return 1;
+                                ret = 1;
+                                goto out;
+                        }
                        reallocationTableLen =
                                        le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
-                                        return 0;
+                                        ret = 0;
+                                        goto out;
                                } else if (origLoc == packet) {
                                        *new_block = le32_to_cpu(
                                                        entry->mappedLocation) +
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
-                                        return 0;
+                                        ret = 0;
+                                        goto out;
                                } else if (origLoc > packet)
                                        break;
                        }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
                                              st->mapEntry[k].mappedLocation) +
                                        ((old_block - map->s_partition_root) &
                                         (sdata->s_packet_len - 1));
-                                return 0;
+                                ret = 0;
+                                goto out;
                        }
-                        return 1;
+                        ret = 1;
+                        goto out;
                } /* if old_block */
        }
        if (i == sbi->s_partitions) {
                /* outside of partitions */
                /* for now, fail =) */
-                return 1;
+                ret = 1;
        }
-        return 0;
+out:
+        mutex_unlock(&sbi->s_alloc_mutex);
+        return ret;
 }
 static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a5c7c61836..7b27b063ff6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
 #include <linux/stat.h>
 #include <linux/cdrom.h>
 #include <linux/nls.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/vmalloc.h>
@@ -135,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
        ei->i_next_alloc_block = 0;
        ei->i_next_alloc_goal = 0;
        ei->i_strat4096 = 0;
+        init_rwsem(&ei->i_data_sem);
        return &ei->vfs_inode;
 }
-static void udf_destroy_inode(struct inode *inode)
+static void udf_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
+static void udf_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, udf_i_callback);
+}
 static void init_once(void *foo)
 {
        struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -567,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        if (!udf_parse_options(options, &uopt, true))
                return -EINVAL;
-        lock_kernel();
+        write_lock(&sbi->s_cred_lock);
        sbi->s_flags = uopt.flags;
        sbi->s_uid   = uopt.uid;
        sbi->s_gid   = uopt.gid;
        sbi->s_umask = uopt.umask;
        sbi->s_fmode = uopt.fmode;
        sbi->s_dmode = uopt.dmode;
+        write_unlock(&sbi->s_cred_lock);
        if (sbi->s_lvid_bh) {
                int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -590,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
                udf_open_lvid(sb);
 out_unlock:
-        unlock_kernel();
        return error;
 }
@@ -959,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
                (sizeof(struct buffer_head *) * nr_groups);
        if (size <= PAGE_SIZE)
-                bitmap = kmalloc(size, GFP_KERNEL);
+                bitmap = kzalloc(size, GFP_KERNEL);
        else
-                bitmap = vmalloc(size); /* TODO: get rid of vmalloc */
+                bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
        if (bitmap == NULL) {
                udf_error(sb, __func__,
@@ -970,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
                return NULL;
        }
-        memset(bitmap, 0x00, size);
        bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
        bitmap->s_nr_groups = nr_groups;
        return bitmap;
@@ -1774,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
        if (!bh)
                return;
+        mutex_lock(&sbi->s_alloc_mutex);
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sbi);
@@ -1790,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
+        mutex_unlock(&sbi->s_alloc_mutex);
 }
 static void udf_close_lvid(struct super_block *sb)
@@ -1802,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
        if (!bh)
                return;
+        mutex_lock(&sbi->s_alloc_mutex);
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sbi);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1822,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
+        mutex_unlock(&sbi->s_alloc_mutex);
+}
+u64 lvid_get_unique_id(struct super_block *sb)
+{
+        struct buffer_head *bh;
+        struct udf_sb_info *sbi = UDF_SB(sb);
+        struct logicalVolIntegrityDesc *lvid;
+        struct logicalVolHeaderDesc *lvhd;
+        u64 uniqueID;
+        u64 ret;
+        bh = sbi->s_lvid_bh;
+        if (!bh)
+                return 0;
+        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+        lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
+        mutex_lock(&sbi->s_alloc_mutex);
+        ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
+        if (!(++uniqueID & 0xFFFFFFFF))
+                uniqueID += 16;
+        lvhd->uniqueID = cpu_to_le64(uniqueID);
+        mutex_unlock(&sbi->s_alloc_mutex);
+        mark_buffer_dirty(bh);
+        return ret;
 }
 static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1879,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        struct kernel_lb_addr rootdir, fileset;
        struct udf_sb_info *sbi;
-        lock_kernel();
        uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
        uopt.uid = -1;
        uopt.gid = -1;
@@ -1889,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        uopt.dmode = UDF_INVALID_MODE;
        sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
-        if (!sbi) {
+        if (!sbi)
-                unlock_kernel();
                return -ENOMEM;
-        }
        sb->s_fs_info = sbi;
@@ -1929,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        sbi->s_fmode = uopt.fmode;
        sbi->s_dmode = uopt.dmode;
        sbi->s_nls_map = uopt.nls_map;
+        rwlock_init(&sbi->s_cred_lock);
        if (uopt.session == 0xFFFFFFFF)
                sbi->s_session = udf_get_last_session(sb);
@@ -2038,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
                goto error_out;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
-        unlock_kernel();
        return 0;
 error_out:
@@ -2059,7 +2093,6 @@ error_out:
        kfree(sbi);
        sb->s_fs_info = NULL;
-        unlock_kernel();
        return -EINVAL;
 }
@@ -2098,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
        sbi = UDF_SB(sb);
-        lock_kernel();
        if (sbi->s_vat_inode)
                iput(sbi->s_vat_inode);
        if (sbi->s_partitions)
@@ -2115,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
        kfree(sbi->s_partmaps);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
-        unlock_kernel();
 }
 static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2179,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
        uint16_t ident;
        struct spaceBitmapDesc *bm;
-        lock_kernel();
        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
        bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2217,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
                }
        }
        brelse(bh);
 out:
-        unlock_kernel();
        return accum;
 }
@@ -2233,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
        int8_t etype;
        struct extent_position epos;
-        lock_kernel();
+        mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
        epos.block = UDF_I(table)->i_location;
        epos.offset = sizeof(struct unallocSpaceEntry);
        epos.bh = NULL;
@@ -2243,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
                accum += (elen >> table->i_sb->s_blocksize_bits);
        brelse(epos.bh);
+        mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
-        unlock_kernel();
        return accum;
 }
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b..b1d4488b0f1 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "udf_i.h"
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
        int err = -EIO;
        unsigned char *p = kmap(page);
        struct udf_inode_info *iinfo;
+        uint32_t pos;
-        lock_kernel();
        iinfo = UDF_I(inode);
+        pos = udf_block_map(inode, 0);
+        down_read(&iinfo->i_data_sem);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
        } else {
-                bh = sb_bread(inode->i_sb, udf_block_map(inode, 0));
+                bh = sb_bread(inode->i_sb, pos);
                if (!bh)
                        goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
        udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
        brelse(bh);
-        unlock_kernel();
+        up_read(&iinfo->i_data_sem);
        SetPageUptodate(page);
        kunmap(page);
        unlock_page(page);
        return 0;
 out:
-        unlock_kernel();
+        up_read(&iinfo->i_data_sem);
        SetPageError(page);
        kunmap(page);
        unlock_page(page);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de4107..d1bd31ea724 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
 #ifndef _UDF_I_H
 #define _UDF_I_H
+/*
+ * The i_data_sem and i_mutex protect the allocation information of
+ * regular files and symlinks. This includes all extents belonging to
+ * the file/symlink, whether the data is stored in-inode or in external data
+ * blocks, preallocation, goal block information... When extents are read,
+ * i_mutex or i_data_sem must be held (for reading is enough in case of
+ * i_data_sem). When extents are changed, i_data_sem must be held for writing
+ * and also i_mutex must be held.
+ *
+ * For directories i_mutex is used for all the necessary protection.
+ */
 struct udf_inode_info {
        struct timespec         i_crtime;
        /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
                struct long_ad          *i_lad;
                __u8            *i_data;
        } i_ext;
+        struct rw_semaphore     i_data_sem;
        struct inode vfs_inode;
 };
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c276..4858c191242 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
 #define __LINUX_UDF_SB_H
 #include <linux/mutex.h>
+#include <linux/bitops.h>
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC                 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
        uid_t                   s_uid;
        mode_t                  s_fmode;
        mode_t                  s_dmode;
+        /* Lock protecting consistency of above permission settings */
+        rwlock_t                s_cred_lock;
        /* Root Info */
        struct timespec         s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
        __u16                   s_udfrev;
        /* Miscellaneous flags */
-        __u32                   s_flags;
+        unsigned long           s_flags;
        /* Encoding info */
        struct nls_table        *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
 int udf_compute_nr_groups(struct super_block *sb, u32 partition);
-#define UDF_QUERY_FLAG(X,Y)                     ( UDF_SB(X)->s_flags & ( 1 << (Y) ) )
+static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
-#define UDF_SET_FLAG(X,Y)                       ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) )
+{
-#define UDF_CLEAR_FLAG(X,Y)                     ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) )
+        return test_bit(flag, &UDF_SB(sb)->s_flags);
+}
+static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
+{
+        set_bit(flag, &UDF_SB(sb)->s_flags);
+}
+static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
+{
+        clear_bit(flag, &UDF_SB(sb)->s_flags);
+}
 #endif /* __LINUX_UDF_SB_H */
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f430..eba48209f9f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
 };
 /* super.c */
+__attribute__((format(printf, 3, 4)))
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
 static inline void udf_updated_lvid(struct super_block *sb)
 {
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
        sb->s_dirt = 1;
        UDF_SB(sb)->s_lvid_dirty = 1;
 }
+extern u64 lvid_get_unique_id(struct super_block *sb);
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
-extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c47daed56d..2c61ac5d4e4 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
        return &ei->vfs_inode;
 }
-static void ufs_destroy_inode(struct inode *inode)
+static void ufs_i_callback(struct rcu_head *head)
 {
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
+static void ufs_destroy_inode(struct inode *inode)
+{
+        call_rcu(&inode->i_rcu, ufs_i_callback);
+}
 static void init_once(void *foo)
 {
        struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6ca..faca4499709 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
                                   kmem.o \
                                   xfs_aops.o \
                                   xfs_buf.o \
+                                   xfs_discard.o \
                                   xfs_export.o \
                                   xfs_file.o \
                                   xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c37081..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-typedef struct sv_s {
-        wait_queue_head_t waiters;
-} sv_t;
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
-        DECLARE_WAITQUEUE(wait, current);
-        add_wait_queue_exclusive(&sv->waiters, &wait);
-        __set_current_state(TASK_UNINTERRUPTIBLE);
-        spin_unlock(lock);
-        schedule();
-        remove_wait_queue(&sv->waiters, &wait);
-}
-#define sv_init(sv,flag,name) \
-        init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-        /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-        _sv_wait(sv, lock)
-#define sv_signal(sv) \
-        wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-        wake_up_all(&(sv)->waiters)
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3..39f4f809bb6 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 int
-xfs_check_acl(struct inode *inode, int mask)
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct xfs_inode *ip = XFS_I(inode);
+        struct xfs_inode *ip;
        struct posix_acl *acl;
        int error = -EAGAIN;
+        ip = XFS_I(inode);
        trace_xfs_check_acl(ip);
        /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
        if (!XFS_IFORK_Q(ip))
                return -EAGAIN;
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
        acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd..ec7bbb5645b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-        IO_READ,        /* mapping for a read */
-        IO_DELAY,       /* mapping covers delalloc region */
-        IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
-        IO_NEW          /* just allocated */
-};
 /*
 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
        xfs_inode_t             *ip = XFS_I(ioend->io_inode);
        xfs_fsize_t             isize;
-        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-        ASSERT(ioend->io_type != IO_READ);
        if (unlikely(ioend->io_error))
                return 0;
@@ -244,10 +232,8 @@ xfs_end_io(
         * We might have to update the on-disk file size after extending
         * writes.
         */
-        if (ioend->io_type != IO_READ) {
+        error = xfs_setfilesize(ioend);
-                error = xfs_setfilesize(ioend);
+        ASSERT(!error || error == EAGAIN);
-                ASSERT(!error || error == EAGAIN);
-        }
        /*
         * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
 xfs_map_blocks(
        struct inode            *inode,
        loff_t                  offset,
-        ssize_t                 count,
        struct xfs_bmbt_irec    *imap,
-        int                     flags)
+        int                     type,
+        int                     nonblocking)
 {
-        int                     nmaps = 1;
+        struct xfs_inode        *ip = XFS_I(inode);
-        int                     new = 0;
+        struct xfs_mount        *mp = ip->i_mount;
+        ssize_t                 count = 1 << inode->i_blkbits;
+        xfs_fileoff_t           offset_fsb, end_fsb;
+        int                     error = 0;
+        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
+        int                     nimaps = 1;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -XFS_ERROR(EIO);
+        if (type == IO_UNWRITTEN)
+                bmapi_flags |= XFS_BMAPI_IGSTATE;
+        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+                if (nonblocking)
+                        return -XFS_ERROR(EAGAIN);
+                xfs_ilock(ip, XFS_ILOCK_SHARED);
+        }
+        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+               (ip->i_df.if_flags & XFS_IFEXTENTS));
+        ASSERT(offset <= mp->m_maxioffset);
+        if (offset + count > mp->m_maxioffset)
+                count = mp->m_maxioffset - offset;
+        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+        offset_fsb = XFS_B_TO_FSBT(mp, offset);
+        error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                          bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+        if (error)
+                return -XFS_ERROR(error);
+        if (type == IO_DELALLOC &&
+            (!nimaps || isnullstartblock(imap->br_startblock))) {
+                error = xfs_iomap_write_allocate(ip, offset, count, imap);
+                if (!error)
+                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+                return -XFS_ERROR(error);
+        }
+#ifdef DEBUG
+        if (type == IO_UNWRITTEN) {
+                ASSERT(nimaps);
+                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+        }
+#endif
+        if (nimaps)
+                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+        return 0;
 }
 STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
        submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
                   WRITE_SYNC_PLUG : WRITE, bio);
-        ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-        bio_put(bio);
 }
 STATIC struct bio *
 xfs_alloc_ioend_bio(
        struct buffer_head      *bh)
 {
-        struct bio              *bio;
        int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
+        struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
-        do {
-                bio = bio_alloc(GFP_NOIO, nvecs);
-                nvecs >>= 1;
-        } while (!bio);
        ASSERT(bio->bi_private == NULL);
        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
-        bio_get(bio);
        return bio;
 }
@@ -470,9 +497,8 @@ xfs_submit_ioend(
        /* Pass 1 - start writeback */
        do {
                next = ioend->io_list;
-                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
                        xfs_start_buffer_writeback(bh);
-                }
        } while ((ioend = next) != NULL);
        /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-        lock_buffer(bh);
        xfs_map_buffer(inode, bh, imap, offset);
-        bh->b_bdev = xfs_find_bdev_for_inode(inode);
        set_buffer_mapped(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
 }
 /*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-        struct page             *page,
-        unsigned int            pg_offset)
-{
-        struct buffer_head      *bh, *head;
-        int                     ret = 0;
-        if (PageWriteback(page))
-                return 0;
-        if (!PageDirty(page))
-                return 0;
-        if (!page->mapping)
-                return 0;
-        if (!page_has_buffers(page))
-                return 0;
-        bh = head = page_buffers(page);
-        do {
-                if (!buffer_uptodate(bh))
-                        break;
-                if (!buffer_mapped(bh))
-                        break;
-                ret += bh->b_size;
-                if (ret >= pg_offset)
-                        break;
-        } while ((bh = bh->b_this_page) != head);
-        return ret;
-}
-STATIC size_t
-xfs_probe_cluster(
-        struct inode            *inode,
-        struct page             *startpage,
-        struct buffer_head      *bh,
-        struct buffer_head      *head)
-{
-        struct pagevec          pvec;
-        pgoff_t                 tindex, tlast, tloff;
-        size_t                  total = 0;
-        int                     done = 0, i;
-        /* First sum forwards in this page */
-        do {
-                if (!buffer_uptodate(bh) || !buffer_mapped(bh))
-                        return total;
-                total += bh->b_size;
-        } while ((bh = bh->b_this_page) != head);
-        /* if we reached the end of the page, sum forwards in following pages */
-        tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-        tindex = startpage->index + 1;
-        /* Prune this back to avoid pathological behavior */
-        tloff = min(tlast, startpage->index + 64);
-        pagevec_init(&pvec, 0);
-        while (!done && tindex <= tloff) {
-                unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-                if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-                        break;
-                for (i = 0; i < pagevec_count(&pvec); i++) {
-                        struct page *page = pvec.pages[i];
-                        size_t pg_offset, pg_len = 0;
-                        if (tindex == tlast) {
-                                pg_offset =
-                                    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-                                if (!pg_offset) {
-                                        done = 1;
-                                        break;
-                                }
-                        } else
-                                pg_offset = PAGE_CACHE_SIZE;
-                        if (page->index == tindex && trylock_page(page)) {
-                                pg_len = xfs_probe_page(page, pg_offset);
-                                unlock_page(page);
-                        }
-                        if (!pg_len) {
-                                done = 1;
-                                break;
-                        }
-                        total += pg_len;
-                        tindex++;
-                }
-                pagevec_release(&pvec);
-                cond_resched();
-        }
-        return total;
-}
-/*
 * Test if a given page is suitable for writing as part of an unwritten
 * or delayed allocate extent.
 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
                        if (buffer_unwritten(bh))
                                acceptable = (type == IO_UNWRITTEN);
                        else if (buffer_delay(bh))
-                                acceptable = (type == IO_DELAY);
+                                acceptable = (type == IO_DELALLOC);
                        else if (buffer_dirty(bh) && buffer_mapped(bh))
-                                acceptable = (type == IO_NEW);
+                                acceptable = (type == IO_OVERWRITE);
                        else
                                break;
                } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
        loff_t                  tindex,
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
-        struct writeback_control *wbc,
+        struct writeback_control *wbc)
-        int                     all_bh)
 {
        struct buffer_head      *bh, *head;
        xfs_off_t               end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
                        continue;
                }
-                if (buffer_unwritten(bh) || buffer_delay(bh)) {
+                if (buffer_unwritten(bh) || buffer_delay(bh) ||
+                    buffer_mapped(bh)) {
                        if (buffer_unwritten(bh))
                                type = IO_UNWRITTEN;
+                        else if (buffer_delay(bh))
+                                type = IO_DELALLOC;
                        else
-                                type = IO_DELAY;
+                                type = IO_OVERWRITE;
                        if (!xfs_imap_valid(inode, imap, offset)) {
                                done = 1;
                                continue;
                        }
-                        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+                        lock_buffer(bh);
-                        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+                        if (type != IO_OVERWRITE)
+                                xfs_map_at_offset(inode, bh, imap, offset);
-                        xfs_map_at_offset(inode, bh, imap, offset);
                        xfs_add_to_ioend(inode, bh, offset, type,
                                         ioendp, done);
                        page_dirty--;
                        count++;
                } else {
-                        type = IO_NEW;
+                        done = 1;
-                        if (buffer_mapped(bh) && all_bh) {
-                                lock_buffer(bh);
-                                xfs_add_to_ioend(inode, bh, offset,
-                                                type, ioendp, done);
-                                count++;
-                                page_dirty--;
-                        } else {
-                                done = 1;
-                        }
                }
        } while (offset += len, (bh = bh->b_this_page) != head);
@@ -876,7 +790,6 @@ xfs_cluster_write(
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
        struct writeback_control *wbc,
-        int                     all_bh,
        pgoff_t                 tlast)
 {
        struct pagevec          pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                        imap, ioendp, wbc, all_bh);
+                                        imap, ioendp, wbc);
                        if (done)
                                break;
                }
@@ -934,9 +847,8 @@ xfs_aops_discard_page(
        struct xfs_inode        *ip = XFS_I(inode);
        struct buffer_head      *bh, *head;
        loff_t                  offset = page_offset(page);
-        ssize_t                 len = 1 << inode->i_blkbits;
-        if (!xfs_is_delayed_page(page, IO_DELAY))
+        if (!xfs_is_delayed_page(page, IO_DELALLOC))
                goto out_invalidate;
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -949,58 +861,14 @@ xfs_aops_discard_page(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        bh = head = page_buffers(page);
        do {
-                int             done;
-                xfs_fileoff_t   offset_fsb;
-                xfs_bmbt_irec_t imap;
-                int             nimaps = 1;
                int             error;
-                xfs_fsblock_t   firstblock;
+                xfs_fileoff_t   start_fsb;
-                xfs_bmap_free_t flist;
                if (!buffer_delay(bh))
                        goto next_buffer;
-                offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
-                /*
-                 * Map the range first and check that it is a delalloc extent
-                 * before trying to unmap the range. Otherwise we will be
-                 * trying to remove a real extent (which requires a
-                 * transaction) or a hole, which is probably a bad idea...
-                 */
-                error = xfs_bmapi(NULL, ip, offset_fsb, 1,
-                                XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-                                &nimaps, NULL);
-                if (error) {
-                        /* something screwed, just bail */
-                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-                                "page discard failed delalloc mapping lookup.");
-                        }
-                        break;
-                }
-                if (!nimaps) {
-                        /* nothing there */
-                        goto next_buffer;
-                }
-                if (imap.br_startblock != DELAYSTARTBLOCK) {
-                        /* been converted, ignore */
-                        goto next_buffer;
-                }
-                WARN_ON(imap.br_blockcount == 0);
-                /*
-                 * Note: while we initialise the firstblock/flist pair, they
-                 * should never be used because blocks should never be
-                 * allocated or freed for a delalloc extent and hence we need
-                 * don't cancel or finish them after the xfs_bunmapi() call.
-                 */
-                xfs_bmap_init(&flist, &firstblock);
-                error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-                                        &flist, &done);
-                ASSERT(!flist.xbf_count && !flist.xbf_first);
                if (error) {
                        /* something screwed, just bail */
                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +878,7 @@ xfs_aops_discard_page(
                        break;
                }
 next_buffer:
-                offset += len;
+                offset += 1 << inode->i_blkbits;
        } while ((bh = bh->b_this_page) != head);
@@ -1047,10 +915,10 @@ xfs_vm_writepage(
        unsigned int            type;
        __uint64_t              end_offset;
        pgoff_t                 end_index, last_index;
-        ssize_t                 size, len;
+        ssize_t                 len;
-        int                     flags, err, imap_valid = 0, uptodate = 1;
+        int                     err, imap_valid = 0, uptodate = 1;
        int                     count = 0;
-        int                     all_bh = 0;
+        int                     nonblocking = 0;
        trace_xfs_writepage(inode, page, 0);
@@ -1101,109 +969,78 @@ xfs_vm_writepage(
        bh = head = page_buffers(page);
        offset = page_offset(page);
-        flags = BMAPI_READ;
+        type = IO_OVERWRITE;
-        type = IO_NEW;
+        if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+                nonblocking = 1;
        do {
+                int new_ioend = 0;
                if (offset >= end_offset)
                        break;
                if (!buffer_uptodate(bh))
                        uptodate = 0;
                /*
-                 * A hole may still be marked uptodate because discard_buffer
+                 * set_page_dirty dirties all buffers in a page, independent
-                 * leaves the flag set.
+                 * of their state.  The dirty state however is entirely
+                 * meaningless for holes (!mapped && uptodate), so skip
+                 * buffers covering holes here.
                 */
                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-                        ASSERT(!buffer_dirty(bh));
                        imap_valid = 0;
                        continue;
                }
-                if (imap_valid)
+                if (buffer_unwritten(bh)) {
-                        imap_valid = xfs_imap_valid(inode, &imap, offset);
+                        if (type != IO_UNWRITTEN) {
-                if (buffer_unwritten(bh) || buffer_delay(bh)) {
-                        int new_ioend = 0;
-                        /*
-                         * Make sure we don't use a read-only iomap
-                         */
-                        if (flags == BMAPI_READ)
-                                imap_valid = 0;
-                        if (buffer_unwritten(bh)) {
                                type = IO_UNWRITTEN;
-                                flags = BMAPI_WRITE | BMAPI_IGNSTATE;
+                                imap_valid = 0;
-                        } else if (buffer_delay(bh)) {
-                                type = IO_DELAY;
-                                flags = BMAPI_ALLOCATE;
-                                if (wbc->sync_mode == WB_SYNC_NONE)
-                                        flags |= BMAPI_TRYLOCK;
-                        }
-                        if (!imap_valid) {
-                                /*
-                                 * If we didn't have a valid mapping then we
-                                 * need to ensure that we put the new mapping
-                                 * in a new ioend structure. This needs to be
-                                 * done to ensure that the ioends correctly
-                                 * reflect the block mappings at io completion
-                                 * for unwritten extent conversion.
-                                 */
-                                new_ioend = 1;
-                                err = xfs_map_blocks(inode, offset, len,
-                                                &imap, flags);
-                                if (err)
-                                        goto error;
-                                imap_valid = xfs_imap_valid(inode, &imap,
-                                                            offset);
                        }
-                        if (imap_valid) {
+                } else if (buffer_delay(bh)) {
-                                xfs_map_at_offset(inode, bh, &imap, offset);
+                        if (type != IO_DELALLOC) {
-                                xfs_add_to_ioend(inode, bh, offset, type,
+                                type = IO_DELALLOC;
-                                                 &ioend, new_ioend);
+                                imap_valid = 0;
-                                count++;
                        }
                } else if (buffer_uptodate(bh)) {
-                        /*
+                        if (type != IO_OVERWRITE) {
-                         * we got here because the buffer is already mapped.
+                                type = IO_OVERWRITE;
-                         * That means it must already have extents allocated
+                                imap_valid = 0;
-                         * underneath it. Map the extent by reading it.
+                        }
-                         */
+                } else {
-                        if (!imap_valid || flags != BMAPI_READ) {
+                        if (PageUptodate(page)) {
-                                flags = BMAPI_READ;
+                                ASSERT(buffer_mapped(bh));
-                                size = xfs_probe_cluster(inode, page, bh, head);
+                                imap_valid = 0;
-                                err = xfs_map_blocks(inode, offset, size,
-                                                &imap, flags);
-                                if (err)
-                                        goto error;
-                                imap_valid = xfs_imap_valid(inode, &imap,
-                                                            offset);
                        }
+                        continue;
+                }
+                if (imap_valid)
+                        imap_valid = xfs_imap_valid(inode, &imap, offset);
+                if (!imap_valid) {
                        /*
-                         * We set the type to IO_NEW in case we are doing a
+                         * If we didn't have a valid mapping then we need to
-                         * small write at EOF that is extending the file but
+                         * put the new mapping into a separate ioend structure.
-                         * without needing an allocation. We need to update the
+                         * This ensures non-contiguous extents always have
-                         * file size on I/O completion in this case so it is
+                         * separate ioends, which is particularly important
-                         * the same case as having just allocated a new extent
+                         * for unwritten extent conversion at I/O completion
-                         * that we are writing into for the first time.
+                         * time.
                         */
-                        type = IO_NEW;
+                        new_ioend = 1;
-                        if (trylock_buffer(bh)) {
+                        err = xfs_map_blocks(inode, offset, &imap, type,
-                                if (imap_valid)
+                                             nonblocking);
-                                        all_bh = 1;
+                        if (err)
-                                xfs_add_to_ioend(inode, bh, offset, type,
+                                goto error;
-                                                &ioend, !imap_valid);
+                        imap_valid = xfs_imap_valid(inode, &imap, offset);
-                                count++;
+                }
-                        } else {
+                if (imap_valid) {
-                                imap_valid = 0;
+                        lock_buffer(bh);
-                        }
+                        if (type != IO_OVERWRITE)
-                } else if (PageUptodate(page)) {
+                                xfs_map_at_offset(inode, bh, &imap, offset);
-                        ASSERT(buffer_mapped(bh));
+                        xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-                        imap_valid = 0;
+                                         new_ioend);
+                        count++;
                }
                if (!iohead)
@@ -1232,7 +1069,7 @@ xfs_vm_writepage(
                        end_index = last_index;
                xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                        wbc, all_bh, end_index);
+                                  wbc, end_index);
        }
        if (iohead)
@@ -1301,13 +1138,19 @@ __xfs_get_blocks(
        int                     create,
        int                     direct)
 {
-        int                     flags = create ? BMAPI_WRITE : BMAPI_READ;
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fileoff_t           offset_fsb, end_fsb;
+        int                     error = 0;
+        int                     lockmode = 0;
        struct xfs_bmbt_irec    imap;
+        int                     nimaps = 1;
        xfs_off_t               offset;
        ssize_t                 size;
-        int                     nimap = 1;
        int                     new = 0;
-        int                     error;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -XFS_ERROR(EIO);
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1316,15 +1159,45 @@ __xfs_get_blocks(
        if (!create && direct && offset >= i_size_read(inode))
                return 0;
-        if (direct && create)
+        if (create) {
-                flags |= BMAPI_DIRECT;
+                lockmode = XFS_ILOCK_EXCL;
+                xfs_ilock(ip, lockmode);
+        } else {
+                lockmode = xfs_ilock_map_shared(ip);
+        }
+        ASSERT(offset <= mp->m_maxioffset);
+        if (offset + size > mp->m_maxioffset)
+                size = mp->m_maxioffset - offset;
+        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-        error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+        error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-                          &new);
+                          XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
        if (error)
-                return -error;
+                goto out_unlock;
-        if (nimap == 0)
-                return 0;
+        if (create &&
+            (!nimaps ||
+             (imap.br_startblock == HOLESTARTBLOCK ||
+              imap.br_startblock == DELAYSTARTBLOCK))) {
+                if (direct) {
+                        error = xfs_iomap_write_direct(ip, offset, size,
+                                                       &imap, nimaps);
+                } else {
+                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
+                }
+                if (error)
+                        goto out_unlock;
+                trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+        } else if (nimaps) {
+                trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+        } else {
+                trace_xfs_get_blocks_notfound(ip, offset, size);
+                goto out_unlock;
+        }
+        xfs_iunlock(ip, lockmode);
        if (imap.br_startblock != HOLESTARTBLOCK &&
            imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1391,6 +1264,10 @@ __xfs_get_blocks(
        }
        return 0;
+out_unlock:
+        xfs_iunlock(ip, lockmode);
+        return -error;
 }
 int
@@ -1478,7 +1355,7 @@ xfs_vm_direct_IO(
        ssize_t                 ret;
        if (rw & WRITE) {
-                iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+                iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
@@ -1504,11 +1381,42 @@ xfs_vm_write_failed(
        struct inode            *inode = mapping->host;
        if (to > inode->i_size) {
-                struct iattr    ia = {
+                /*
-                        .ia_valid       = ATTR_SIZE | ATTR_FORCE,
+                 * punch out the delalloc blocks we have already allocated. We
-                        .ia_size        = inode->i_size,
+                 * don't call xfs_setattr() to do this as we may be in the
-                };
+                 * middle of a multi-iovec write and so the vfs inode->i_size
-                xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+                 * will not match the xfs ip->i_size and so it will zero too
+                 * much. Hence we just truncate the page cache to zero what is
+                 * necessary and punch the delalloc blocks directly.
+                 */
+                struct xfs_inode        *ip = XFS_I(inode);
+                xfs_fileoff_t           start_fsb;
+                xfs_fileoff_t           end_fsb;
+                int                     error;
+                truncate_pagecache(inode, to, inode->i_size);
+                /*
+                 * Check if there are any blocks that are outside of i_size
+                 * that need to be trimmed back.
+                 */
+                start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+                end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+                if (end_fsb <= start_fsb)
+                        return;
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                                        end_fsb - start_fsb);
+                if (error) {
+                        /* something screwed, just bail */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "xfs_vm_write_failed: unable to clean up ino %lld",
+                                                ip->i_ino);
+                        }
+                }
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 }
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237..71f721e1a71 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 /*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+        IO_DIRECT = 0,  /* special case for direct I/O ioends */
+        IO_DELALLOC,    /* mapping covers delalloc region */
+        IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
+        IO_OVERWRITE,   /* mapping covers already allocated extent */
+};
+/*
+ * Value/label pairs for the I/O types above; the entry for value 0
+ * (IO_DIRECT) carries an empty label.
+ * NOTE(review): presumably expanded as a symbolic-name table by the tracing
+ * code -- confirm at the point of use.
+ */
+#define XFS_IO_TYPES \
+        { 0,                    "" }, \
+        { IO_DELALLOC,          "delalloc" }, \
+        { IO_UNWRITTEN,         "unwritten" }, \
+        { IO_OVERWRITE,         "overwrite" }
+/*
 * xfs_ioend struct manages large extent writes for XFS.
 * It can manage several multi-page bio's at once.
 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb5..ac1c7e8378d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
-        .shrink = xfsbufd_wakeup,
-        .seeks = DEFAULT_SEEKS,
-};
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
 }
 /*
- *      Internal xfs_buf_t object manipulation
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
 */
+STATIC void
+xfs_buf_lru_add(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buftarg *target = bp->b_target;
+        spin_lock(&target->bt_lru_lock);
+        /* Already linked on the LRU: no second hold, no second list entry. */
+        if (!list_empty(&bp->b_lru))
+                goto out_unlock;
+        /* The LRU takes its own hold on every buffer it lists. */
+        atomic_inc(&bp->b_hold);
+        list_add_tail(&bp->b_lru, &target->bt_lru);
+        target->bt_lru_nr++;
+out_unlock:
+        spin_unlock(&target->bt_lru_lock);
+}
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are no
+ * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ *
+ * This only unlinks the buffer and adjusts bt_lru_nr; b_hold is not touched
+ * here.
+ */
+STATIC void
+xfs_buf_lru_del(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buftarg *btp = bp->b_target;
+        /* Fast path: not on the LRU, so skip taking bt_lru_lock at all. */
+        if (list_empty(&bp->b_lru))
+                return;
+        spin_lock(&btp->bt_lru_lock);
+        /* Recheck under the lock before unlinking and fixing up the count. */
+        if (!list_empty(&bp->b_lru)) {
+                list_del_init(&bp->b_lru);
+                btp->bt_lru_nr--;
+        }
+        spin_unlock(&btp->bt_lru_lock);
+}
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+        struct xfs_buf  *bp)
+{
+        bp->b_flags |= XBF_STALE;
+        /* With b_lru_ref at zero, xfs_buf_rele() will not re-add it to the LRU. */
+        atomic_set(&(bp)->b_lru_ref, 0);
+        /* Unlocked peek first; bt_lru_lock is only taken if it looks listed. */
+        if (!list_empty(&bp->b_lru)) {
+                struct xfs_buftarg *btp = bp->b_target;
+                spin_lock(&btp->bt_lru_lock);
+                /* Recheck under the lock before unlinking. */
+                if (!list_empty(&bp->b_lru)) {
+                        list_del_init(&bp->b_lru);
+                        btp->bt_lru_nr--;
+                        /* Drop the hold that xfs_buf_lru_add() took. */
+                        atomic_dec(&bp->b_hold);
+                }
+                spin_unlock(&btp->bt_lru_lock);
+        }
+        /* At least one hold must remain after the LRU's hold is dropped. */
+        ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
 STATIC void
 _xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
        memset(bp, 0, sizeof(xfs_buf_t));
        atomic_set(&bp->b_hold, 1);
+        atomic_set(&bp->b_lru_ref, 1);
        init_completion(&bp->b_iowait);
+        INIT_LIST_HEAD(&bp->b_lru);
        INIT_LIST_HEAD(&bp->b_list);
        RB_CLEAR_NODE(&bp->b_rbnode);
        sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
+        ASSERT(list_empty(&bp->b_lru));
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
                                        __func__, gfp_mask);
                        XFS_STATS_INC(xb_page_retries);
-                        xfsbufd_wakeup(NULL, 0, gfp_mask);
                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
@@ -488,29 +557,16 @@ found:
        spin_unlock(&pag->pag_buf_lock);
        xfs_perag_put(pag);
-        /* Attempt to get the semaphore without sleeping,
+        if (xfs_buf_cond_lock(bp)) {
-         * if this does not work then we need to drop the
+                /* failed, so wait for the lock if requested. */
-         * spinlock and do a hard attempt on the semaphore.
-         */
-        if (down_trylock(&bp->b_sema)) {
                if (!(flags & XBF_TRYLOCK)) {
-                        /* wait for buffer ownership */
                        xfs_buf_lock(bp);
                        XFS_STATS_INC(xb_get_locked_waited);
                } else {
-                        /* We asked for a trylock and failed, no need
-                         * to look at file offset and length here, we
-                         * know that this buffer at least overlaps our
-                         * buffer and is locked, therefore our buffer
-                         * either does not exist, or is this buffer.
-                         */
                        xfs_buf_rele(bp);
                        XFS_STATS_INC(xb_busy_locked);
                        return NULL;
                }
-        } else {
-                /* trylock worked */
-                XB_SET_OWNER(bp);
        }
        if (bp->b_flags & XBF_STALE) {
@@ -840,7 +896,7 @@ xfs_buf_rele(
        trace_xfs_buf_rele(bp, _RET_IP_);
        if (!pag) {
-                ASSERT(!bp->b_relse);
+                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
@@ -848,13 +904,15 @@ xfs_buf_rele(
        }
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-                if (bp->b_relse) {
+                if (!(bp->b_flags & XBF_STALE) &&
-                        atomic_inc(&bp->b_hold);
+                           atomic_read(&bp->b_lru_ref)) {
+                        xfs_buf_lru_add(bp);
                        spin_unlock(&pag->pag_buf_lock);
-                        bp->b_relse(bp);
                } else {
+                        xfs_buf_lru_del(bp);
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
                        rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
                        spin_unlock(&pag->pag_buf_lock);
@@ -876,10 +934,18 @@ xfs_buf_rele(
 */
 /*
- *      Locks a buffer object, if it is not already locked.
+ *      Locks a buffer object, if it is not already locked.  Note that this in
- *      Note that this in no way locks the underlying pages, so it is only
+ *      no way locks the underlying pages, so it is only useful for
- *      useful for synchronizing concurrent use of buffer objects, not for
+ *      synchronizing concurrent use of buffer objects, not for synchronizing
- *      synchronizing independent access to the underlying pages.
+ *      independent access to the underlying pages.
+ *
+ *      If we come across a stale, pinned, locked buffer, we know that we are
+ *      being asked to lock a buffer that has been reallocated. Because it is
+ *      pinned, we know that the log has not been pushed to disk and hence it
+ *      will still be locked.  Rather than continuing to have trylock attempts
+ *      fail until someone else pushes the log, push it ourselves before
+ *      returning.  This means that the xfsaild will not get stuck trying
+ *      to push on stale inode buffers.
 */
 int
 xfs_buf_cond_lock(
@@ -890,6 +956,8 @@ xfs_buf_cond_lock(
        locked = down_trylock(&bp->b_sema) == 0;
        if (locked)
                XB_SET_OWNER(bp);
+        else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+                xfs_log_force(bp->b_target->bt_mount, 0);
        trace_xfs_buf_cond_lock(bp, _RET_IP_);
        return locked ? 0 : -EBUSY;
@@ -1441,51 +1509,84 @@ xfs_buf_iomove(
 */
 /*
- *      Wait for any bufs with callbacks that have been submitted but
+ * Wait for any bufs with callbacks that have been submitted but have not yet
- *      have not yet returned... walk the hash list for the target.
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
 */
 void
 xfs_wait_buftarg(
        struct xfs_buftarg      *btp)
 {
-        struct xfs_perag        *pag;
+        struct xfs_buf          *bp;
-        uint                    i;
-        for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+restart:
-                pag = xfs_perag_get(btp->bt_mount, i);
+        spin_lock(&btp->bt_lru_lock);
-                spin_lock(&pag->pag_buf_lock);
+        while (!list_empty(&btp->bt_lru)) {
-                while (rb_first(&pag->pag_buf_tree)) {
+                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-                        spin_unlock(&pag->pag_buf_lock);
+                if (atomic_read(&bp->b_hold) > 1) {
+                        spin_unlock(&btp->bt_lru_lock);
                        delay(100);
-                        spin_lock(&pag->pag_buf_lock);
+                        goto restart;
                }
-                spin_unlock(&pag->pag_buf_lock);
+                /*
-                xfs_perag_put(pag);
+                 * clear the LRU reference count so the buffer doesn't get
+                 * ignored in xfs_buf_rele().
+                 */
+                atomic_set(&bp->b_lru_ref, 0);
+                spin_unlock(&btp->bt_lru_lock);
+                xfs_buf_rele(bp);
+                spin_lock(&btp->bt_lru_lock);
        }
+        spin_unlock(&btp->bt_lru_lock);
 }
-/*
+/*
+ * Shrinker callback for a buffer target's LRU.
+ *
+ * @shrink:     the bt_shrinker member embedded in the buftarg
+ * @nr_to_scan: number of LRU entries to examine; 0 only queries the count
+ * @mask:       allocation context of the caller (not used here)
+ *
+ * Walks up to @nr_to_scan buffers from the head of the LRU.  A buffer whose
+ * b_lru_ref is still non-zero has it decremented and is rotated to the tail;
+ * a buffer whose b_lru_ref has reached zero is moved to a private list and
+ * released once bt_lru_lock has been dropped.
+ *
+ * Returns the number of buffers left on the LRU (bt_lru_nr).
+ */
+int
- *      buftarg list for delwrite queue processing
+xfs_buftarg_shrink(
- */
+        struct shrinker         *shrink,
-static LIST_HEAD(xfs_buftarg_list);
+        int                     nr_to_scan,
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
+        gfp_t                   mask)
-STATIC void
-xfs_register_buftarg(
-        xfs_buftarg_t           *btp)
 {
-        spin_lock(&xfs_buftarg_lock);
+        struct xfs_buftarg      *btp = container_of(shrink,
-        list_add(&btp->bt_list, &xfs_buftarg_list);
+                                        struct xfs_buftarg, bt_shrinker);
-        spin_unlock(&xfs_buftarg_lock);
+        struct xfs_buf          *bp;
-}
+        LIST_HEAD(dispose);
-STATIC void
+        if (!nr_to_scan)
-xfs_unregister_buftarg(
+                return btp->bt_lru_nr;
-        xfs_buftarg_t           *btp)
-{
+        spin_lock(&btp->bt_lru_lock);
-        spin_lock(&xfs_buftarg_lock);
+        while (!list_empty(&btp->bt_lru)) {
-        list_del(&btp->bt_list);
+                if (nr_to_scan-- <= 0)
-        spin_unlock(&xfs_buftarg_lock);
+                        break;
+                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+                /*
+                 * Decrement the b_lru_ref count unless the value is already
+                 * zero. If the value is already zero, we need to reclaim the
+                 * buffer, otherwise it gets another trip through the LRU.
+                 *
+                 * atomic_add_unless() returns non-zero when it performed the
+                 * decrement, i.e. when the count had not yet reached zero, so
+                 * a non-zero return is the "another trip" case.
+                 */
+                if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+                        list_move_tail(&bp->b_lru, &btp->bt_lru);
+                        continue;
+                }
+                /*
+                 * remove the buffer from the LRU now to avoid needing another
+                 * lock round trip inside xfs_buf_rele().
+                 */
+                list_move(&bp->b_lru, &dispose);
+                btp->bt_lru_nr--;
+        }
+        spin_unlock(&btp->bt_lru_lock);
+        /* Drop the LRU's hold on each reclaimed buffer outside the lock. */
+        while (!list_empty(&dispose)) {
+                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+                list_del_init(&bp->b_lru);
+                xfs_buf_rele(bp);
+        }
+        return btp->bt_lru_nr;
 }
 void
@@ -1493,17 +1594,14 @@ xfs_free_buftarg(
        struct xfs_mount        *mp,
        struct xfs_buftarg      *btp)
 {
+        unregister_shrinker(&btp->bt_shrinker);
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
        iput(btp->bt_mapping->host);
-        /* Unregister the buftarg first so that we don't get a
-         * wakeup finding a non-existent task
-         */
-        xfs_unregister_buftarg(btp);
        kthread_stop(btp->bt_task);
        kmem_free(btp);
 }
@@ -1600,20 +1698,13 @@ xfs_alloc_delwrite_queue(
        xfs_buftarg_t           *btp,
        const char              *fsname)
 {
-        int     error = 0;
-        INIT_LIST_HEAD(&btp->bt_list);
        INIT_LIST_HEAD(&btp->bt_delwrite_queue);
        spin_lock_init(&btp->bt_delwrite_lock);
        btp->bt_flags = 0;
        btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-        if (IS_ERR(btp->bt_task)) {
+        if (IS_ERR(btp->bt_task))
-                error = PTR_ERR(btp->bt_task);
+                return PTR_ERR(btp->bt_task);
-                goto out_error;
+        return 0;
-        }
-        xfs_register_buftarg(btp);
-out_error:
-        return error;
 }
 xfs_buftarg_t *
@@ -1630,12 +1721,17 @@ xfs_alloc_buftarg(
        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
+        INIT_LIST_HEAD(&btp->bt_lru);
+        spin_lock_init(&btp->bt_lru_lock);
        if (xfs_setsize_buftarg_early(btp, bdev))
                goto error;
        if (xfs_mapping_buftarg(btp, bdev))
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
+        btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+        register_shrinker(&btp->bt_shrinker);
        return btp;
 error:
@@ -1740,27 +1836,6 @@ xfs_buf_runall_queues(
        flush_workqueue(queue);
 }
-STATIC int
-xfsbufd_wakeup(
-        struct shrinker         *shrink,
-        int                     priority,
-        gfp_t                   mask)
-{
-        xfs_buftarg_t           *btp;
-        spin_lock(&xfs_buftarg_lock);
-        list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
-                if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
-                        continue;
-                if (list_empty(&btp->bt_delwrite_queue))
-                        continue;
-                set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
-                wake_up_process(btp->bt_task);
-        }
-        spin_unlock(&xfs_buftarg_lock);
-        return 0;
-}
 /*
 * Move as many buffers as specified to the supplied list
 * idicating if we skipped any buffers to prevent deadlocks.
@@ -1781,7 +1856,6 @@ xfs_buf_delwri_split(
        INIT_LIST_HEAD(list);
        spin_lock(dwlk);
        list_for_each_entry_safe(bp, n, dwq, b_list) {
-                trace_xfs_buf_delwri_split(bp, _RET_IP_);
                ASSERT(bp->b_flags & XBF_DELWRI);
                if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1869,7 @@ xfs_buf_delwri_split(
                                         _XBF_RUN_QUEUES);
                        bp->b_flags |= XBF_WRITE;
                        list_move_tail(&bp->b_list, list);
+                        trace_xfs_buf_delwri_split(bp, _RET_IP_);
                } else
                        skipped++;
        }
@@ -1955,7 +2030,6 @@ xfs_buf_init(void)
        if (!xfsconvertd_workqueue)
                goto out_destroy_xfsdatad_workqueue;
-        register_shrinker(&xfs_buf_shake);
        return 0;
 out_destroy_xfsdatad_workqueue:
@@ -1971,7 +2045,6 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-        unregister_shrinker(&xfs_buf_shake);
        destroy_workqueue(xfsconvertd_workqueue);
        destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf9..cbe65950e52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
        /* per device delwri queue */
        struct task_struct      *bt_task;
-        struct list_head        bt_list;
        struct list_head        bt_delwrite_queue;
        spinlock_t              bt_delwrite_lock;
        unsigned long           bt_flags;
+        /* LRU control structures */
+        struct shrinker         bt_shrinker;
+        struct list_head        bt_lru;
+        spinlock_t              bt_lru_lock;
+        unsigned int            bt_lru_nr;
 } xfs_buftarg_t;
 /*
@@ -147,8 +152,6 @@ typedef struct xfs_buftarg {
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 #define XB_PAGES        2
@@ -164,9 +167,11 @@ typedef struct xfs_buf {
        xfs_off_t               b_file_offset;  /* offset in file */
        size_t                  b_buffer_length;/* size of buffer in bytes */
        atomic_t                b_hold;         /* reference count */
+        atomic_t                b_lru_ref;      /* lru reclaim ref count */
        xfs_buf_flags_t         b_flags;        /* status flags */
        struct semaphore        b_sema;         /* semaphore for lockables */
+        struct list_head        b_lru;          /* lru list */
        wait_queue_head_t       b_waiters;      /* unpin waiters */
        struct list_head        b_list;
        struct xfs_perag        *b_pag;         /* contains rbtree root */
@@ -176,7 +181,6 @@ typedef struct xfs_buf {
        void                    *b_addr;        /* virtual address of buffer */
        struct work_struct      b_iodone_work;
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
-        xfs_buf_relse_t         b_relse;        /* releasing function */
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
@@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_ZEROFLAGS(bp)   ((bp)->b_flags &= \
                ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
-#define XFS_BUF_STALE(bp)       ((bp)->b_flags |= XBF_STALE)
+void xfs_buf_stale(struct xfs_buf *bp);
+/*
+ * Mark a buffer stale.  The expansion deliberately carries no trailing
+ * semicolon: callers supply their own, and a semicolon baked into the macro
+ * would turn "if (x) XFS_BUF_STALE(bp); else ..." into a syntax error.
+ */
+#define XFS_BUF_STALE(bp)       xfs_buf_stale(bp)
 #define XFS_BUF_UNSTALE(bp)     ((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)     ((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp) do {                            \
@@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_FSPRIVATE2(bp, type)            ((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)         ((bp)->b_fspriv2 = (void*)(val))
 #define XFS_BUF_SET_START(bp)                   do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)       ((bp)->b_relse = (func))
 #define XFS_BUF_PTR(bp)                 (xfs_caddr_t)((bp)->b_addr)
 #define XFS_BUF_SET_PTR(bp, val, cnt)   xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SIZE(bp)                ((bp)->b_buffer_length)
 #define XFS_BUF_SET_SIZE(bp, cnt)       ((bp)->b_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)    do { } while (0)
+/*
+ * Set the buffer's LRU reclaim reference count (b_lru_ref) to @lru_ref.
+ * The value is stored unconditionally, replacing whatever count was there.
+ */
+static inline void
+xfs_buf_set_ref(
+        struct xfs_buf  *bp,
+        int             lru_ref)
+{
+        atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)    xfs_buf_set_ref(bp, ref)
 #define XFS_BUF_SET_VTYPE(bp, type)             do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref)                do { } while (0)
 #define XFS_BUF_ISPINNED(bp)    atomic_read(&((bp)->b_pin_count))
@@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void);
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-        if (!bp->b_relse)
+        xfs_buf_unlock(bp);
-                xfs_buf_unlock(bp);
        xfs_buf_rele(bp);
 }
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 00000000000..05201ae719e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+/*
+ * Discard the free extents of one allocation group.
+ *
+ * @mp:             mount the AG belongs to
+ * @agno:           allocation group to walk
+ * @start, @len:    range to trim, in filesystem blocks
+ * @minlen:         free extents shorter than this are not discarded
+ * @blocks_trimmed: incremented by the length of every extent discarded
+ *
+ * Walks the by-size free space btree from the longest extents downwards and
+ * stops at the first extent shorter than @minlen.  Extents outside the range
+ * and extents that are still busy are skipped.
+ *
+ * Returns 0 on success (including when no AGF buffer was returned), or an
+ * error code in the sign convention of the xfs_* helpers called here; the
+ * result of blkdev_issue_discard() is negated to match.
+ */
+STATIC int
+xfs_trim_extents(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          agno,
+        xfs_fsblock_t           start,
+        xfs_fsblock_t           len,
+        xfs_fsblock_t           minlen,
+        __uint64_t              *blocks_trimmed)
+{
+        struct block_device     *bdev = mp->m_ddev_targp->bt_bdev;
+        struct xfs_btree_cur    *cur;
+        struct xfs_buf          *agbp;
+        struct xfs_perag        *pag;
+        int                     error;
+        int                     i;
+        pag = xfs_perag_get(mp, agno);
+        error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+        if (error || !agbp)
+                goto out_put_perag;
+        cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+        /*
+         * Force out the log.  This means any transactions that might have freed
+         * space before we took the AGF buffer lock are now on disk, and the
+         * volatile disk cache is flushed.
+         */
+        xfs_log_force(mp, XFS_LOG_SYNC);
+        /*
+         * Look up the longest btree in the AGF and start with it.
+         *
+         * agf_longest is an on-disk (big-endian) field, so it has to be
+         * converted to CPU byte order before being used as a length.
+         *
+         * NOTE(review): with a start block of 0 in the key, a less-or-equal
+         * lookup may land just before the records whose length equals
+         * agf_longest -- confirm against the by-size btree key ordering.
+         */
+        error = xfs_alloc_lookup_le(cur, 0,
+                            be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+        if (error)
+                goto out_del_cursor;
+        /*
+         * Loop until we are done with all extents that are large
+         * enough to be worth discarding.
+         */
+        while (i) {
+                xfs_agblock_t fbno;
+                xfs_extlen_t flen;
+                error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+                if (error)
+                        goto out_del_cursor;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+                ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+                /*
+                 * Too small?  Give up.
+                 */
+                if (flen < minlen) {
+                        trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+                        goto out_del_cursor;
+                }
+                /*
+                 * If the extent is entirely outside of the range we are
+                 * supposed to discard skip it.  Do not bother to trim
+                 * down partially overlapping ranges for now.
+                 */
+                if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+                    XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+                        trace_xfs_discard_exclude(mp, agno, fbno, flen);
+                        goto next_extent;
+                }
+                /*
+                 * If any blocks in the range are still busy, skip the
+                 * discard and try again the next time.
+                 */
+                if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+                        trace_xfs_discard_busy(mp, agno, fbno, flen);
+                        goto next_extent;
+                }
+                trace_xfs_discard_extent(mp, agno, fbno, flen);
+                /* Negate so the error matches the sign used by the xfs_* calls. */
+                error = -blkdev_issue_discard(bdev,
+                                XFS_AGB_TO_DADDR(mp, agno, fbno),
+                                XFS_FSB_TO_BB(mp, flen),
+                                GFP_NOFS, 0);
+                if (error)
+                        goto out_del_cursor;
+                *blocks_trimmed += flen;
+next_extent:
+                /* Step to the next smaller record; i becomes 0 at the start. */
+                error = xfs_btree_decrement(cur, 0, &i);
+                if (error)
+                        goto out_del_cursor;
+        }
+out_del_cursor:
+        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        xfs_buf_relse(agbp);
+out_put_perag:
+        xfs_perag_put(pag);
+        return error;
+}
+/*
+ * Handle a trim request from userspace for the data device of @mp.
+ *
+ * @mp:     mount to trim
+ * @urange: user pointer to the struct fstrim_range describing the request;
+ *          on success its len field is overwritten with the number of bytes
+ *          trimmed
+ *
+ * Requires CAP_SYS_ADMIN.  The byte range is converted to filesystem blocks,
+ * the allocation groups it touches are worked out, and xfs_trim_extents() is
+ * run on each of them.  A failure in one AG does not stop the walk; the last
+ * error seen is returned and the user structure is then left unmodified.
+ *
+ * Returns 0 on success or a negated error code.
+ */
+int
+xfs_ioc_trim(
+        struct xfs_mount                *mp,
+        struct fstrim_range __user      *urange)
+{
+        struct request_queue    *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+        unsigned int            granularity = q->limits.discard_granularity;
+        struct fstrim_range     range;
+        xfs_fsblock_t           start, len, minlen;
+        xfs_agnumber_t          start_agno, end_agno, agno;
+        __uint64_t              blocks_trimmed = 0;
+        int                     error, last_error = 0;
+        if (!capable(CAP_SYS_ADMIN))
+                return -XFS_ERROR(EPERM);
+        if (copy_from_user(&range, urange, sizeof(range)))
+                return -XFS_ERROR(EFAULT);
+        /*
+         * Truncating down the len isn't actually quite correct, but using
+         * XFS_B_TO_FSB would mean we trivially get overflows for values
+         * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+         * used by the fstrim application.  In the end it really doesn't
+         * matter as trimming blocks is an advisory interface.
+         */
+        start = XFS_B_TO_FSBT(mp, range.start);
+        len = XFS_B_TO_FSBT(mp, range.len);
+        /* Never go below the device's discard granularity. */
+        minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+        /* A start beyond the last AG is rejected; an end beyond it is clamped. */
+        start_agno = XFS_FSB_TO_AGNO(mp, start);
+        if (start_agno >= mp->m_sb.sb_agcount)
+                return -XFS_ERROR(EINVAL);
+        end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+        if (end_agno >= mp->m_sb.sb_agcount)
+                end_agno = mp->m_sb.sb_agcount - 1;
+        for (agno = start_agno; agno <= end_agno; agno++) {
+                /* Negate the per-AG result; keep going after a failure. */
+                error = -xfs_trim_extents(mp, agno, start, len, minlen,
+                                          &blocks_trimmed);
+                if (error)
+                        last_error = error;
+        }
+        if (last_error)
+                return last_error;
+        /* Report back how much was actually trimmed, in bytes. */
+        range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+        if (copy_to_user(urange, &range, sizeof(range)))
+                return -XFS_ERROR(EFAULT);
+        return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 00000000000..e82b6dd3e12
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+struct fstrim_range;
+extern int      xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790e..fc0114da7fd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
        else
                fileid_type = FILEID_INO32_GEN_PARENT;
-        /* filesystem may contain 64bit inode numbers */
+        /*
-        if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
+         * If the filesystem may contain 64bit inode numbers, we need
+         * to use larger file handles that can represent them.
+         *
+         * While we only allocate inodes that do not fit into 32 bits any
+         * large enough filesystem may contain them, thus the slightly
+         * confusing looking conditional below.
+         */
+        if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+            (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
                fileid_type |= XFS_FILEID_TYPE_64FLAG;
        /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a16..a55c1b46b21 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
 #include "xfs_trace.h"
 #include <linux/dcache.h>
+#include <linux/falloc.h>
 static const struct vm_operations_struct xfs_file_vm_ops;
 /*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_lock(&VFS_I(ip)->i_mutex);
+        xfs_ilock(ip, type);
+}
+static inline void
+xfs_rw_iunlock(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        xfs_iunlock(ip, type);
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+static inline void
+xfs_rw_ilock_demote(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        xfs_ilock_demote(ip, type);
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+/*
 *      xfs_iozero
 *
 *      xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
-        if (unlikely(ioflags & IO_ISDIRECT))
-                mutex_lock(&inode->i_mutex);
-        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (unlikely(ioflags & IO_ISDIRECT)) {
+                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
                if (inode->i_mapping->nrpages) {
                        ret = -xfs_flushinval_pages(ip,
                                        (iocb->ki_pos & PAGE_CACHE_MASK),
                                        -1, FI_REMAPF_LOCKED);
+                        if (ret) {
+                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+                                return ret;
+                        }
                }
-                mutex_unlock(&inode->i_mutex);
+                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-                if (ret) {
+        } else
-                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-                        return ret;
-                }
-        }
        trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
@@ -285,7 +319,7 @@ xfs_file_aio_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
-        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
@@ -309,7 +343,7 @@ xfs_file_splice_read(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
-        xfs_ilock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
@@ -317,10 +351,61 @@ xfs_file_splice_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
-        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
+STATIC void
+xfs_aio_write_isize_update(
+        struct inode    *inode,
+        loff_t          *ppos,
+        ssize_t         bytes_written)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        xfs_fsize_t             isize = i_size_read(inode);
+        if (bytes_written > 0)
+                XFS_STATS_ADD(xs_write_bytes, bytes_written);
+        if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+                                        *ppos > isize))
+                *ppos = isize;
+        if (*ppos > ip->i_size) {
+                xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+                if (*ppos > ip->i_size)
+                        ip->i_size = *ppos;
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+        }
+}
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occurred.  In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+        struct xfs_inode        *ip)
+{
+        if (ip->i_new_size) {
+                xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+                ip->i_new_size = 0;
+                if (ip->i_d.di_size > ip->i_size)
+                        ip->i_d.di_size = ip->i_size;
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+        }
+}
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * could cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
 STATIC ssize_t
 xfs_file_splice_write(
        struct pipe_inode_info  *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        xfs_fsize_t             isize, new_size;
+        xfs_fsize_t             new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
@@ -355,27 +440,9 @@ xfs_file_splice_write(
        trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-        if (ret > 0)
-                XFS_STATS_ADD(xs_write_bytes, ret);
-        isize = i_size_read(inode);
-        if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
-                *ppos = isize;
-        if (*ppos > ip->i_size) {
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                if (*ppos > ip->i_size)
-                        ip->i_size = *ppos;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
-        if (ip->i_new_size) {
+        xfs_aio_write_isize_update(inode, ppos, ret);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_aio_write_newsize_update(ip);
-                ip->i_new_size = 0;
-                if (ip->i_d.di_size > ip->i_size)
-                        ip->i_d.di_size = ip->i_size;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -562,247 +629,314 @@ out_lock:
        return error;
 }
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
 STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_aio_write_checks(
-        struct kiocb            *iocb,
+        struct file             *file,
-        const struct iovec      *iovp,
+        loff_t                  *pos,
-        unsigned long           nr_segs,
+        size_t                  *count,
-        loff_t                  pos)
+        int                     *iolock)
 {
-        struct file             *file = iocb->ki_filp;
+        struct inode            *inode = file->f_mapping->host;
-        struct address_space    *mapping = file->f_mapping;
-        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fsize_t             new_size;
-        ssize_t                 ret = 0, error = 0;
+        int                     error = 0;
-        int                     ioflags = 0;
-        xfs_fsize_t             isize, new_size;
-        int                     iolock;
-        size_t                  ocount = 0, count;
-        int                     need_i_mutex;
-        XFS_STATS_INC(xs_write_calls);
+        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+        if (error) {
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+                *iolock = 0;
+                return error;
+        }
-        BUG_ON(iocb->ki_pos != pos);
+        new_size = *pos + *count;
+        if (new_size > ip->i_size)
+                ip->i_new_size = new_size;
-        if (unlikely(file->f_flags & O_DIRECT))
+        if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-                ioflags |= IO_ISDIRECT;
+                file_update_time(file);
-        if (file->f_mode & FMODE_NOCMTIME)
-                ioflags |= IO_INVIS;
+        /*
+         * If the offset is beyond the size of the file, we need to zero any
+         * blocks that fall between the existing EOF and the start of this
+         * write.
+         */
+        if (*pos > ip->i_size)
+                error = -xfs_zero_eof(ip, *pos, ip->i_size);
-        error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;
-        count = ocount;
+        /*
-        if (count == 0)
+         * If we're writing the file then make sure to clear the setuid and
-                return 0;
+         * setgid bits if the process is not being run by root.  This keeps
+         * people from modifying setuid and setgid binaries.
-        xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+         */
+        return file_remove_suid(file);
-        if (XFS_FORCED_SHUTDOWN(mp))
+}
-                return -EIO;
-relock:
+/*
-        if (ioflags & IO_ISDIRECT) {
+ * xfs_file_dio_aio_write - handle direct IO writes
-                iolock = XFS_IOLOCK_SHARED;
+ *
-                need_i_mutex = 0;
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
-        } else {
+ * By separating it from the buffered write path we remove all the tricky to
-                iolock = XFS_IOLOCK_EXCL;
+ * follow locking changes and looping.
-                need_i_mutex = 1;
+ *
-                mutex_lock(&inode->i_mutex);
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+        struct kiocb            *iocb,
+        const struct iovec      *iovp,
+        unsigned long           nr_segs,
+        loff_t                  pos,
+        size_t                  ocount,
+        int                     *iolock)
+{
+        struct file             *file = iocb->ki_filp;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        ssize_t                 ret = 0;
+        size_t                  count = ocount;
+        int                     unaligned_io = 0;
+        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
+                                        mp->m_rtdev_targp : mp->m_ddev_targp;
+        *iolock = 0;
+        if ((pos & target->bt_smask) || (count & target->bt_smask))
+                return -XFS_ERROR(EINVAL);
+        if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+                unaligned_io = 1;
+        if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+                *iolock = XFS_IOLOCK_EXCL;
+        else
+                *iolock = XFS_IOLOCK_SHARED;
+        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+        ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+        if (ret)
+                return ret;
+        if (mapping->nrpages) {
+                WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+                ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+                                                        FI_REMAPF_LOCKED);
+                if (ret)
+                        return ret;
        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+        /*
+         * If we are doing unaligned IO, wait for all other IO to drain,
-start:
+         * otherwise demote the lock if we had to flush cached pages
-        error = -generic_write_checks(file, &pos, &count,
+         */
-                                        S_ISBLK(inode->i_mode));
+        if (unaligned_io)
-        if (error) {
+                xfs_ioend_wait(ip);
-                xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+        else if (*iolock == XFS_IOLOCK_EXCL) {
-                goto out_unlock_mutex;
+                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+                *iolock = XFS_IOLOCK_SHARED;
        }
-        if (ioflags & IO_ISDIRECT) {
+        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-                xfs_buftarg_t   *target =
+        ret = generic_file_direct_write(iocb, iovp,
-                        XFS_IS_REALTIME_INODE(ip) ?
+                        &nr_segs, pos, &iocb->ki_pos, count, ocount);
-                                mp->m_rtdev_targp : mp->m_ddev_targp;
-                if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+        /* No fallback to buffered IO on errors for XFS. */
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+        ASSERT(ret < 0 || ret == count);
-                        return XFS_ERROR(-EINVAL);
+        return ret;
-                }
+}
-                if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
+STATIC ssize_t
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+xfs_file_buffered_aio_write(
-                        iolock = XFS_IOLOCK_EXCL;
+        struct kiocb            *iocb,
-                        need_i_mutex = 1;
+        const struct iovec      *iovp,
-                        mutex_lock(&inode->i_mutex);
+        unsigned long           nr_segs,
-                        xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+        loff_t                  pos,
-                        goto start;
+        size_t                  ocount,
-                }
+        int                     *iolock)
-        }
+{
+        struct file             *file = iocb->ki_filp;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        ssize_t                 ret;
+        int                     enospc = 0;
+        size_t                  count = ocount;
-        new_size = pos + count;
+        *iolock = XFS_IOLOCK_EXCL;
-        if (new_size > ip->i_size)
+        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
-                ip->i_new_size = new_size;
-        if (likely(!(ioflags & IO_INVIS)))
+        ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
-                file_update_time(file);
+        if (ret)
+                return ret;
+        /* We can write back this queue in page reclaim */
+        current->backing_dev_info = mapping->backing_dev_info;
+write_retry:
+        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+        ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+                        pos, &iocb->ki_pos, count, ret);
        /*
-         * If the offset is beyond the size of the file, we have a couple
+         * if we just got an ENOSPC, flush the inode now we aren't holding any
-         * of things to do. First, if there is already space allocated
+         * page locks and retry *once*
-         * we need to either create holes or zero the disk or ...
-         *
-         * If there is a page where the previous size lands, we need
-         * to zero it out up to the new size.
         */
+        if (ret == -ENOSPC && !enospc) {
-        if (pos > ip->i_size) {
+                ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-                error = xfs_zero_eof(ip, pos, ip->i_size);
+                if (ret)
-                if (error) {
+                        return ret;
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                enospc = 1;
-                        goto out_unlock_internal;
+                goto write_retry;
-                }
        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        current->backing_dev_info = NULL;
+        return ret;
+}
-        /*
+STATIC ssize_t
-         * If we're writing the file then make sure to clear the
+xfs_file_aio_write(
-         * setuid and setgid bits if the process is not being run
+        struct kiocb            *iocb,
-         * by root.  This keeps people from modifying setuid and
+        const struct iovec      *iovp,
-         * setgid binaries.
+        unsigned long           nr_segs,
-         */
+        loff_t                  pos)
-        error = -file_remove_suid(file);
+{
-        if (unlikely(error))
+        struct file             *file = iocb->ki_filp;
-                goto out_unlock_internal;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        ssize_t                 ret;
+        int                     iolock;
+        size_t                  ocount = 0;
-        /* We can write back this queue in page reclaim */
+        XFS_STATS_INC(xs_write_calls);
-        current->backing_dev_info = mapping->backing_dev_info;
-        if ((ioflags & IO_ISDIRECT)) {
+        BUG_ON(iocb->ki_pos != pos);
-                if (mapping->nrpages) {
-                        WARN_ON(need_i_mutex == 0);
-                        error = xfs_flushinval_pages(ip,
-                                        (pos & PAGE_CACHE_MASK),
-                                        -1, FI_REMAPF_LOCKED);
-                        if (error)
-                                goto out_unlock_internal;
-                }
-                if (need_i_mutex) {
+        ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-                        /* demote the lock now the cached pages are gone */
+        if (ret)
-                        xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+                return ret;
-                        mutex_unlock(&inode->i_mutex);
-                        iolock = XFS_IOLOCK_SHARED;
+        if (ocount == 0)
-                        need_i_mutex = 0;
+                return 0;
-                }
-                trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
+        xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
-                ret = generic_file_direct_write(iocb, iovp,
-                                &nr_segs, pos, &iocb->ki_pos, count, ocount);
-                /*
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-                 * direct-io write to a hole: fall through to buffered I/O
+                return -EIO;
-                 * for completing the rest of the request.
-                 */
-                if (ret >= 0 && ret != count) {
-                        XFS_STATS_ADD(xs_write_bytes, ret);
-                        pos += ret;
+        if (unlikely(file->f_flags & O_DIRECT))
-                        count -= ret;
+                ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+                                                ocount, &iolock);
+        else
+                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+                                                ocount, &iolock);
-                        ioflags &= ~IO_ISDIRECT;
+        xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-                        xfs_iunlock(ip, iolock);
-                        goto relock;
-                }
-        } else {
-                int enospc = 0;
-                ssize_t ret2 = 0;
-write_retry:
+        if (ret <= 0)
-                trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
+                goto out_unlock;
-                ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
-                                pos, &iocb->ki_pos, count, ret);
-                /*
-                 * if we just got an ENOSPC, flush the inode now we
-                 * aren't holding any page locks and retry *once*
-                 */
-                if (ret2 == -ENOSPC && !enospc) {
-                        error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-                        if (error)
-                                goto out_unlock_internal;
-                        enospc = 1;
-                        goto write_retry;
-                }
-                ret = ret2;
-        }
-        current->backing_dev_info = NULL;
+        /* Handle various SYNC-type writes */
+        if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+                loff_t end = pos + ret - 1;
+                int error, error2;
-        isize = i_size_read(inode);
+                xfs_rw_iunlock(ip, iolock);
-        if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
+                error = filemap_write_and_wait_range(mapping, pos, end);
-                iocb->ki_pos = isize;
+                xfs_rw_ilock(ip, iolock);
-        if (iocb->ki_pos > ip->i_size) {
+                error2 = -xfs_file_fsync(file,
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                                         (file->f_flags & __O_SYNC) ? 0 : 1);
-                if (iocb->ki_pos > ip->i_size)
+                if (error)
-                        ip->i_size = iocb->ki_pos;
+                        ret = error;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                else if (error2)
+                        ret = error2;
        }
-        error = -ret;
+out_unlock:
-        if (ret <= 0)
+        xfs_aio_write_newsize_update(ip);
-                goto out_unlock_internal;
+        xfs_rw_iunlock(ip, iolock);
+        return ret;
+}
-        XFS_STATS_ADD(xs_write_bytes, ret);
+STATIC long
+xfs_file_fallocate(
+        struct file     *file,
+        int             mode,
+        loff_t          offset,
+        loff_t          len)
+{
+        struct inode    *inode = file->f_path.dentry->d_inode;
+        long            error;
+        loff_t          new_size = 0;
+        xfs_flock64_t   bf;
+        xfs_inode_t     *ip = XFS_I(inode);
+        int             cmd = XFS_IOC_RESVSP;
-        /* Handle various SYNC-type writes */
+        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
-        if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+                return -EOPNOTSUPP;
-                loff_t end = pos + ret - 1;
-                int error2;
-                xfs_iunlock(ip, iolock);
+        bf.l_whence = 0;
-                if (need_i_mutex)
+        bf.l_start = offset;
-                        mutex_unlock(&inode->i_mutex);
+        bf.l_len = len;
-                error2 = filemap_write_and_wait_range(mapping, pos, end);
+        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-                if (!error)
-                        error = error2;
-                if (need_i_mutex)
-                        mutex_lock(&inode->i_mutex);
-                xfs_ilock(ip, iolock);
-                error2 = -xfs_file_fsync(file,
+        if (mode & FALLOC_FL_PUNCH_HOLE)
-                                         (file->f_flags & __O_SYNC) ? 0 : 1);
+                cmd = XFS_IOC_UNRESVSP;
-                if (!error)
-                        error = error2;
+        /* check the new inode size is valid before allocating */
+        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+                new_size = offset + len;
+                error = inode_newsize_ok(inode, new_size);
+                if (error)
+                        goto out_unlock;
        }
- out_unlock_internal:
+        error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
-        if (ip->i_new_size) {
+        if (error)
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                goto out_unlock;
-                ip->i_new_size = 0;
-                /*
+        /* Change file size if needed */
-                 * If this was a direct or synchronous I/O that failed (such
+        if (new_size) {
-                 * as ENOSPC) then part of the I/O may have been written to
+                struct iattr iattr;
-                 * disk before the error occured.  In this case the on-disk
-                 * file size may have been adjusted beyond the in-memory file
+                iattr.ia_valid = ATTR_SIZE;
-                 * size and now needs to be truncated back.
+                iattr.ia_size = new_size;
-                 */
+                error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
-                if (ip->i_d.di_size > ip->i_size)
-                        ip->i_d.di_size = ip->i_size;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
-        xfs_iunlock(ip, iolock);
- out_unlock_mutex:
+out_unlock:
-        if (need_i_mutex)
+        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                mutex_unlock(&inode->i_mutex);
+        return error;
-        return -error;
 }
 STATIC int
 xfs_file_open(
        struct inode    *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
+        .fallocate      = xfs_file_fallocate,
 };
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38..f5e2a19e0f8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
+#include "xfs_discard.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
-        kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+        kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
        if (!kbuf)
                goto out_dput;
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
                /*
                 * Extent size must be a multiple of the appropriate block
-                 * size, if set at all.
+                 * size, if set at all. It must also be smaller than the
+                 * maximum extent size supported by the filesystem.
+                 *
+                 * Also, for non-realtime files, limit the extent size hint to
+                 * half the size of the AGs in the filesystem so alignment
+                 * doesn't result in extents larger than an AG.
                 */
                if (fa->fsx_extsize != 0) {
-                        xfs_extlen_t    size;
+                        xfs_extlen_t    size;
+                        xfs_fsblock_t   extsize_fsb;
+                        extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+                        if (extsize_fsb > MAXEXTLEN) {
+                                code = XFS_ERROR(EINVAL);
+                                goto error_return;
+                        }
                        if (XFS_IS_REALTIME_INODE(ip) ||
                            ((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
                                       mp->m_sb.sb_blocklog;
                        } else {
                                size = mp->m_sb.sb_blocksize;
+                                if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+                                        code = XFS_ERROR(EINVAL);
+                                        goto error_return;
+                                }
                        }
                        if (fa->fsx_extsize % size) {
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
        trace_xfs_file_ioctl(ip);
        switch (cmd) {
+        case FITRIM:
+                return xfs_ioc_trim(mp, arg);
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c6..bd5727852fd 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
 #include <linux/namei.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/falloc.h>
 #include <linux/fiemap.h>
 #include <linux/slab.h>
@@ -505,58 +504,6 @@ xfs_vn_setattr(
        return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
-STATIC long
-xfs_vn_fallocate(
-        struct inode    *inode,
-        int             mode,
-        loff_t          offset,
-        loff_t          len)
-{
-        long            error;
-        loff_t          new_size = 0;
-        xfs_flock64_t   bf;
-        xfs_inode_t     *ip = XFS_I(inode);
-        /* preallocation on directories not yet supported */
-        error = -ENODEV;
-        if (S_ISDIR(inode->i_mode))
-                goto out_error;
-        bf.l_whence = 0;
-        bf.l_start = offset;
-        bf.l_len = len;
-        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-        /* check the new inode size is valid before allocating */
-        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-            offset + len > i_size_read(inode)) {
-                new_size = offset + len;
-                error = inode_newsize_ok(inode, new_size);
-                if (error)
-                        goto out_unlock;
-        }
-        error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
-                                       0, XFS_ATTR_NOLOCK);
-        if (error)
-                goto out_unlock;
-        /* Change file size if needed */
-        if (new_size) {
-                struct iattr iattr;
-                iattr.ia_valid = ATTR_SIZE;
-                iattr.ia_size = new_size;
-                error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
-        }
-out_unlock:
-        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-out_error:
-        return error;
-}
 #define XFS_FIEMAP_FLAGS        (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 /*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
        .getxattr               = generic_getxattr,
        .removexattr            = generic_removexattr,
        .listxattr              = xfs_vn_listxattr,
-        .fallocate              = xfs_vn_fallocate,
        .fiemap                 = xfs_vn_fiemap,
 };
@@ -762,7 +708,8 @@ xfs_setup_inode(
        inode->i_state = I_NEW;
        inode_sb_list_add(inode);
-        insert_inode_hash(inode);
+        /* make the inode look hashed for the writeback code */
+        hlist_add_fake(&inode->i_hash);
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff7..09649499774 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
 #include <kmem.h>
 #include <mrlock.h>
-#include <sv.h>
 #include <time.h>
 #include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae..9731898083a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
                        mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
                        mp->m_flags |= XFS_MOUNT_DELAYLOG;
-                        cmn_err(CE_WARN,
-                                "Enabling EXPERIMENTAL delayed logging feature "
-                                "- use at your own risk.\n");
                } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
                        mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
                } else if (!strcmp(this_char, "ihashsize")) {
@@ -609,7 +606,8 @@ xfs_blkdev_get(
 {
        int                     error = 0;
-        *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
+        *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+                                    mp);
        if (IS_ERR(*bdevp)) {
                error = PTR_ERR(*bdevp);
                printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -623,7 +621,7 @@ xfs_blkdev_put(
        struct block_device     *bdev)
 {
        if (bdev)
-                close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+                blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 /*
@@ -837,8 +835,11 @@ xfsaild_wakeup(
        struct xfs_ail          *ailp,
        xfs_lsn_t               threshold_lsn)
 {
-        ailp->xa_target = threshold_lsn;
+        /* only ever move the target forwards */
-        wake_up_process(ailp->xa_task);
+        if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
+                ailp->xa_target = threshold_lsn;
+                wake_up_process(ailp->xa_task);
+        }
 }
 STATIC int
@@ -850,8 +851,17 @@ xfsaild(
        long            tout = 0; /* milliseconds */
        while (!kthread_should_stop()) {
-                schedule_timeout_interruptible(tout ?
+                /*
-                                msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+                 * for short sleeps indicating congestion, don't allow us to
+                 * get woken early. Otherwise all we do is bang on the AIL lock
+                 * without making progress.
+                 */
+                if (tout && tout <= 20)
+                        __set_current_state(TASK_KILLABLE);
+                else
+                        __set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(tout ?
+                                 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
                /* swsusp */
                try_to_freeze();
@@ -938,7 +948,7 @@ out_reclaim:
 * Slab object creation initialisation for the XFS inode.
 * This covers only the idempotent fields in the XFS inode;
 * all other fields need to be initialised on allocation
- * from the slab. This avoids the need to repeatedly intialise
+ * from the slab. This avoids the need to repeatedly initialise
 * fields in the xfs inode that left in the initialise state
 * when freeing the inode.
 */
@@ -1121,6 +1131,8 @@ xfs_fs_evict_inode(
         */
        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+        lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                        &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
        xfs_inactive(ip);
 }
@@ -1402,7 +1414,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+        return -xfs_fs_log_dummy(mp);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981..e22f0057d21 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
        struct inode            *inode = VFS_I(ip);
+        ASSERT(rcu_read_lock_held());
+        /*
+         * check for stale RCU freed inode
+         *
+         * If the inode has been reallocated, it doesn't matter if it's not in
+         * the AG we are walking - we are walking for writeback, so if it
+         * passes all the "valid inode" checks and is dirty, then we'll write
+         * it back anyway.  If it has been reallocated and is still being
+         * initialised, the XFS_INEW check below will catch it.
+         */
+        spin_lock(&ip->i_flags_lock);
+        if (!ip->i_ino)
+                goto out_unlock_noent;
+        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+                goto out_unlock_noent;
+        spin_unlock(&ip->i_flags_lock);
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
-        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-        if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-                return ENOENT;
        /* If we can't grab the inode, it must on it's way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
        /* inode is valid */
        return 0;
+out_unlock_noent:
+        spin_unlock(&ip->i_flags_lock);
+        return ENOENT;
 }
 STATIC int
@@ -98,12 +118,12 @@ restart:
                int             error = 0;
                int             i;
-                read_lock(&pag->pag_ici_lock);
+                rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                        read_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        break;
                }
@@ -118,18 +138,26 @@ restart:
                                batch[i] = NULL;
                        /*
-                         * Update the index for the next lookup. Catch overflows
+                         * Update the index for the next lookup. Catch
-                         * into the next AG range which can occur if we have inodes
+                         * overflows into the next AG range which can occur if
-                         * in the last block of the AG and we are currently
+                         * we have inodes in the last block of the AG and we
-                         * pointing to the last inode.
+                         * are currently pointing to the last inode.
+                         *
+                         * Because we may see inodes that are from the wrong AG
+                         * due to RCU freeing and reallocation, only update the
+                         * index if it lies in this AG. It was a race that led
+                         * us to see this inode, so another lookup from the
+                         * same index will not find it again.
                         */
+                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                                continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }
                /* unlock now we've grabbed the inodes. */
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-                error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+                error2 = xfs_fs_log_dummy(mp);
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
        int             error;
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                xfs_log_force(mp, 0);
-                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
-                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                if (mp->m_super->s_frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
-                        error = xfs_fs_log_dummy(mp, 0);
+                        error = xfs_fs_log_dummy(mp);
+                else
+                        xfs_log_force(mp, 0);
+                xfs_reclaim_inodes(mp, 0);
+                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
        struct xfs_perag *pag;
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
        __xfs_inode_set_reclaim_tag(pag, ip);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        xfs_perag_put(pag);
 }
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
 {
+        ASSERT(rcu_read_lock_held());
+        /* quick check for stale RCU freed inode */
+        if (!ip->i_ino)
+                return 1;
        /*
-         * do some unlocked checks first to avoid unnecceary lock traffic.
+         * do some unlocked checks first to avoid unnecessary lock traffic.
         * The first is a flush lock check, the second is a already in reclaim
         * check. Only do these checks if we are not going to block on locks.
         */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
+         *
+         * Due to RCU lookup, we may find inodes that have been freed and only
+         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+         * aren't candidates for reclaim at all, so we must check the
+         * XFS_IRECLAIMABLE flag is set first before proceeding to reclaim.
         */
        spin_lock(&ip->i_flags_lock);
-        ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-        if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                /* ignore as it is already under reclaim */
+                /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
@@ -795,12 +834,12 @@ reclaim:
         * added to the tree assert that it's been there before to catch
         * problems with the inode life time early on.
         */
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
        __xfs_inode_clear_reclaim(pag, ip);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        /*
         * Here we do an (almost) spurious inode lock in order to coordinate
@@ -853,6 +892,7 @@ restart:
                if (trylock) {
                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
                                skipped++;
+                                xfs_perag_put(pag);
                                continue;
                        }
                        first_index = pag->pag_ici_reclaim_cursor;
@@ -863,14 +903,14 @@ restart:
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;
-                        write_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
-                                write_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                break;
                        }
@@ -890,14 +930,24 @@ restart:
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
+                                 *
+                                 * Because we may see inodes that are from the
+                                 * wrong AG due to RCU freeing and
+                                 * reallocation, only update the index if it
+                                 * lies in this AG. It was a race that led us
+                                 * to see this inode, so another lookup from
+                                 * the same index will not find it again.
                                 */
+                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                                pag->pag_agno)
+                                        continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }
                        /* unlock now we've grabbed the inodes. */
-                        write_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae..ee3cee097e7 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
+#include "xfs_error.h"
 static struct ctl_table_header *xfs_table_header;
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
        return ret;
 }
+STATIC int
+xfs_panic_mask_proc_handler(
+        ctl_table       *ctl,
+        int             write,
+        void            __user *buffer,
+        size_t          *lenp,
+        loff_t          *ppos)
+{
+        int             ret, *valp = ctl->data;
+        ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+        if (!ret && write) {
+                xfs_panic_mask = *valp;
+#ifdef DEBUG
+                xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+        }
+        return ret;
+}
 #endif /* CONFIG_PROC_FS */
 static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
                .data           = &xfs_params.panic_mask.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = xfs_panic_mask_proc_handler,
                .extra1         = &xfs_params.panic_mask.min,
                .extra2         = &xfs_params.panic_mask.max
        },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c59..2d0bcb47907 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __field(int, curr_res)
                __field(int, unit_res)
                __field(unsigned int, flags)
-                __field(void *, reserve_headq)
+                __field(int, reserveq)
-                __field(void *, write_headq)
+                __field(int, writeq)
                __field(int, grant_reserve_cycle)
                __field(int, grant_reserve_bytes)
                __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_res = tic->t_curr_res;
                __entry->unit_res = tic->t_unit_res;
                __entry->flags = tic->t_flags;
-                __entry->reserve_headq = log->l_reserve_headq;
+                __entry->reserveq = list_empty(&log->l_reserveq);
-                __entry->write_headq = log->l_write_headq;
+                __entry->writeq = list_empty(&log->l_writeq);
-                __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
+                xlog_crack_grant_head(&log->l_grant_reserve_head,
-                __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
+                                &__entry->grant_reserve_cycle,
-                __entry->grant_write_cycle = log->l_grant_write_cycle;
+                                &__entry->grant_reserve_bytes);
-                __entry->grant_write_bytes = log->l_grant_write_bytes;
+                xlog_crack_grant_head(&log->l_grant_write_head,
+                                &__entry->grant_write_cycle,
+                                &__entry->grant_write_bytes);
                __entry->curr_cycle = log->l_curr_cycle;
                __entry->curr_block = log->l_curr_block;
-                __entry->tail_lsn = log->l_tail_lsn;
+                __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
        TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-                  "t_unit_res %u t_flags %s reserve_headq 0x%p "
+                  "t_unit_res %u t_flags %s reserveq %s "
-                  "write_headq 0x%p grant_reserve_cycle %d "
+                  "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                  __entry->curr_res,
                  __entry->unit_res,
                  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-                  __entry->reserve_headq,
+                  __entry->reserveq ? "empty" : "active",
-                  __entry->write_headq,
+                  __entry->writeq ? "empty" : "active",
                  __entry->grant_reserve_cycle,
                  __entry->grant_reserve_bytes,
                  __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                 int flags, struct xfs_bmbt_irec *irec),
+                 int type, struct xfs_bmbt_irec *irec),
-        TP_ARGS(ip, offset, count, flags, irec),
+        TP_ARGS(ip, offset, count, type, irec),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
-                __field(int, flags)
+                __field(int, type)
                __field(xfs_fileoff_t, startoff)
                __field(xfs_fsblock_t, startblock)
                __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
-                __entry->flags = flags;
+                __entry->type = type;
                __entry->startoff = irec ? irec->br_startoff : 0;
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                  "offset 0x%llx count %zd flags %s "
+                  "offset 0x%llx count %zd type %s "
                  "startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                  __entry->new_size,
                  __entry->offset,
                  __entry->count,
-                  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+                  __print_symbolic(__entry->type, XFS_IO_TYPES),
                  __entry->startoff,
                  (__int64_t)__entry->startblock,
                  __entry->blockcount)
 )
 #define DEFINE_IOMAP_EVENT(name)        \
-DEFINE_EVENT(xfs_iomap_class, name,     \
+DEFINE_EVENT(xfs_imap_class, name,      \
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                 int flags, struct xfs_bmbt_irec *irec),                \
+                 int type, struct xfs_bmbt_irec *irec),         \
-        TP_ARGS(ip, offset, count, flags, irec))
+        TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
        TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
        TP_PROTO(struct xfs_alloc_arg *args), \
        TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_discard_class,
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                 xfs_agblock_t agbno, xfs_extlen_t len),
+        TP_ARGS(mp, agno, agbno, len),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(xfs_agnumber_t, agno)
+                __field(xfs_agblock_t, agbno)
+                __field(xfs_extlen_t, len)
+        ),
+        TP_fast_assign(
+                __entry->dev = mp->m_super->s_dev;
+                __entry->agno = agno;
+                __entry->agbno = agbno;
+                __entry->len = len;
+        ),
+        TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->agno,
+                  __entry->agbno,
+                  __entry->len)
+)
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                 xfs_agblock_t agbno, xfs_extlen_t len), \
+        TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
 #endif /* _TRACE_XFS_H */
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a1..d22aa310310 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
        ASSERT(list_empty(&dqp->q_freelist));
        mutex_destroy(&dqp->q_qlock);
-        sv_destroy(&dqp->q_pinwait);
        kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
        atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde..206a2815ced 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
        xfs_dquot_t     *dqpout;
        xfs_dquot_t     *dqp;
        int             restarts;
+        int             startagain;
        restarts = 0;
        dqpout = NULL;
        /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-startagain:
+again:
+        startagain = 0;
        mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
        list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
                        ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
                        trace_xfs_dqreclaim_want(dqp);
-                        xfs_dqunlock(dqp);
-                        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-                        if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-                                return NULL;
                        XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-                        goto startagain;
+                        restarts++;
+                        startagain = 1;
+                        goto dqunlock;
                }
                /*
@@ -1906,23 +1905,20 @@ startagain:
                        ASSERT(list_empty(&dqp->q_mplist));
                        list_del_init(&dqp->q_freelist);
                        xfs_Gqm->qm_dqfrlist_cnt--;
-                        xfs_dqunlock(dqp);
                        dqpout = dqp;
                        XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-                        break;
+                        goto dqunlock;
                }
                ASSERT(dqp->q_hash);
                ASSERT(!list_empty(&dqp->q_mplist));
                /*
-                 * Try to grab the flush lock. If this dquot is in the process of
+                 * Try to grab the flush lock. If this dquot is in the process
-                 * getting flushed to disk, we don't want to reclaim it.
+                 * of getting flushed to disk, we don't want to reclaim it.
                 */
-                if (!xfs_dqflock_nowait(dqp)) {
+                if (!xfs_dqflock_nowait(dqp))
-                        xfs_dqunlock(dqp);
+                        goto dqunlock;
-                        continue;
-                }
                /*
                 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
                                xfs_fs_cmn_err(CE_WARN, mp,
                        "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
                        }
-                        xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
+                        goto dqunlock;
-                        continue;
                }
                /*
@@ -1967,13 +1962,8 @@ startagain:
                 */
                if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
                        restarts++;
-                        mutex_unlock(&dqp->q_hash->qh_lock);
+                        startagain = 1;
-                        xfs_dqfunlock(dqp);
+                        goto qhunlock;
-                        xfs_dqunlock(dqp);
-                        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-                        if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
-                                return NULL;
-                        goto startagain;
                }
                ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
                xfs_Gqm->qm_dqfrlist_cnt--;
                dqpout = dqp;
                mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+qhunlock:
                mutex_unlock(&dqp->q_hash->qh_lock);
 dqfunlock:
                xfs_dqfunlock(dqp);
+dqunlock:
                xfs_dqunlock(dqp);
                if (dqpout)
                        break;
                if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-                        return NULL;
+                        break;
+                if (startagain) {
+                        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+                        goto again;
+                }
        }
        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
        return dqpout;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a4..0df88897ef8 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
 #include "xfs_mount.h"
 #include "xfs_error.h"
-static char             message[1024];  /* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL       7
-#define XFS_ERR_MASK            ((1 << 3) - 1)
-static const char * const       err_level[XFS_MAX_ERR_LEVEL+1] =
-                                        {KERN_EMERG, KERN_ALERT, KERN_CRIT,
-                                         KERN_ERR, KERN_WARNING, KERN_NOTICE,
-                                         KERN_INFO, KERN_DEBUG};
 void
-cmn_err(register int level, char *fmt, ...)
+cmn_err(
+        const char      *lvl,
+        const char      *fmt,
+        ...)
 {
-        char    *fp = fmt;
+        struct va_format vaf;
-        int     len;
+        va_list         args;
-        ulong   flags;
-        va_list ap;
+        va_start(args, fmt);
+        vaf.fmt = fmt;
-        level &= XFS_ERR_MASK;
+        vaf.va = &args;
-        if (level > XFS_MAX_ERR_LEVEL)
-                level = XFS_MAX_ERR_LEVEL;
+        printk("%s%pV", lvl, &vaf);
-        spin_lock_irqsave(&xfs_err_lock,flags);
+        va_end(args);
-        va_start(ap, fmt);
-        if (*fmt == '!') fp++;
+        BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
-        len = vsnprintf(message, sizeof(message), fp, ap);
-        if (len >= sizeof(message))
-                len = sizeof(message) - 1;
-        if (message[len-1] == '\n')
-                message[len-1] = 0;
-        printk("%s%s\n", err_level[level], message);
-        va_end(ap);
-        spin_unlock_irqrestore(&xfs_err_lock,flags);
-        BUG_ON(level == CE_PANIC);
 }
 void
-xfs_fs_vcmn_err(
+xfs_fs_cmn_err(
-        int                     level,
+        const char              *lvl,
        struct xfs_mount        *mp,
-        char                    *fmt,
+        const char              *fmt,
-        va_list                 ap)
+        ...)
 {
-        unsigned long           flags;
+        struct va_format        vaf;
-        int                     len = 0;
+        va_list                 args;
-        level &= XFS_ERR_MASK;
+        va_start(args, fmt);
-        if (level > XFS_MAX_ERR_LEVEL)
+        vaf.fmt = fmt;
-                level = XFS_MAX_ERR_LEVEL;
+        vaf.va = &args;
-        spin_lock_irqsave(&xfs_err_lock,flags);
+        printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
+        va_end(args);
-        if (mp) {
+        BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
-                len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+}
+/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
+void
+xfs_cmn_err(
+        int                     panic_tag,
+        const char              *lvl,
+        struct xfs_mount        *mp,
+        const char              *fmt,
+        ...)
+{
+        struct va_format        vaf;
+        va_list                 args;
+        int                     do_panic = 0;
-                /*
+        if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-                 * Skip the printk if we can't print anything useful
+                printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
-                 * due to an over-long device name.
+                do_panic = 1;
-                 */
-                if (len >= sizeof(message))
-                        goto out;
        }
-        len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
+        va_start(args, fmt);
-        if (len >= sizeof(message))
+        vaf.fmt = fmt;
-                len = sizeof(message) - 1;
+        vaf.va = &args;
-        if (message[len-1] == '\n')
-                message[len-1] = 0;
-        printk("%s%s\n", err_level[level], message);
+        printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
- out:
+        va_end(args);
-        spin_unlock_irqrestore(&xfs_err_lock,flags);
-        BUG_ON(level == CE_PANIC);
+        BUG_ON(do_panic);
 }
 void
 assfail(char *expr, char *file, int line)
 {
-        printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
+        printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
+               file, line);
        BUG();
 }
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4..05699f67d47 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
 #include <stdarg.h>
-#define CE_DEBUG        7               /* debug        */
+struct xfs_mount;
-#define CE_CONT         6               /* continuation */
-#define CE_NOTE         5               /* notice       */
+#define CE_DEBUG        KERN_DEBUG
-#define CE_WARN         4               /* warning      */
+#define CE_CONT         KERN_INFO
-#define CE_ALERT        1               /* alert        */
+#define CE_NOTE         KERN_NOTICE
-#define CE_PANIC        0               /* panic        */
+#define CE_WARN         KERN_WARNING
+#define CE_ALERT        KERN_ALERT
-extern void cmn_err(int, char *, ...)
+#define CE_PANIC        KERN_EMERG
-        __attribute__ ((format (printf, 2, 3)));
+void cmn_err(const char *lvl, const char *fmt, ...)
+                __attribute__ ((format (printf, 2, 3)));
+void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
+                const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
+                const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
 extern void assfail(char *expr, char *f, int l);
 #define ASSERT_ALWAYS(expr)     \
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d..11dd72070cb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE    (sizeof(SGI_ACL_DEFAULT)-1)
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c02..58632cc17f2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
-        rwlock_t        pag_ici_lock;   /* incore inode lock */
+        spinlock_t      pag_ici_lock;   /* incore inode cache lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
        int             pag_ici_reclaimable;    /* reclaimable inodes */
        struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca..f3227984a9b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
 #define XFSA_FIXUP_BNO_OK       1
 #define XFSA_FIXUP_CNT_OK       2
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-                    xfs_agblock_t bno, xfs_extlen_t len);
 /*
 * Prototypes for per-ag allocation routines
 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
-STATIC int                              /* error */
+int                                     /* error */
 xfs_alloc_lookup_le(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
 /*
 * Get the data from the pointed-to record.
 */
-STATIC int                              /* error */
+int                                     /* error */
 xfs_alloc_get_rec(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           *bno,   /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
        xfs_extlen_t    rlen;   /* length of returned extent */
        ASSERT(args->alignment == 1);
        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-                args->agno, XFS_BTNUM_BNO);
+                                          args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
         * if any free block does.
         */
-        if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+        error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+        if (error)
                goto error0;
-        if (!i) {
+        if (!i)
-                /*
+                goto not_found;
-                 * Didn't find it, return null.
-                 */
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-                args->agbno = NULLAGBLOCK;
-                return 0;
-        }
        /*
         * Grab the freespace record.
         */
-        if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+        error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+        if (error)
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        ASSERT(fbno <= args->agbno);
        minend = args->agbno + args->minlen;
        maxend = args->agbno + args->maxlen;
        fend = fbno + flen;
        /*
         * Give up if the freespace isn't long enough for the minimum request.
         */
-        if (fend < minend) {
+        if (fend < minend)
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+                goto not_found;
-                args->agbno = NULLAGBLOCK;
-                return 0;
-        }
        /*
         * End of extent will be smaller of the freespace end and the
         * maximal requested end.
-         */
+         *
-        end = XFS_AGBLOCK_MIN(fend, maxend);
-        /*
         * Fix the length according to mod and prod if given.
         */
+        end = XFS_AGBLOCK_MIN(fend, maxend);
        args->len = end - args->agbno;
        xfs_alloc_fix_len(args);
-        if (!xfs_alloc_fix_minleft(args)) {
+        if (!xfs_alloc_fix_minleft(args))
-                xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+                goto not_found;
-                return 0;
-        }
        rlen = args->len;
        ASSERT(args->agbno + rlen <= fend);
        end = args->agbno + rlen;
        /*
         * We are allocating agbno for rlen [agbno .. end]
         * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
                args->agno, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <=
                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+        error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
-                        args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+                                      args->len, XFSA_FIXUP_BNO_OK);
+        if (error) {
                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
                goto error0;
        }
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-        trace_xfs_alloc_exact_done(args);
        args->wasfromfl = 0;
+        trace_xfs_alloc_exact_done(args);
+        return 0;
+not_found:
+        /* Didn't find it, return null. */
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        args->agbno = NULLAGBLOCK;
+        trace_xfs_alloc_exact_notfound(args);
        return 0;
 error0:
@@ -659,6 +661,95 @@ error0:
 }
 /*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+        struct xfs_alloc_arg    *args,  /* allocation argument structure */
+        struct xfs_btree_cur    **gcur, /* good cursor */
+        struct xfs_btree_cur    **scur, /* searching cursor */
+        xfs_agblock_t           gdiff,  /* difference for search comparison */
+        xfs_agblock_t           *sbno,  /* extent found by search */
+        xfs_extlen_t            *slen,
+        xfs_extlen_t            *slena, /* aligned length */
+        int                     dir)    /* 0 = search right, 1 = search left */
+{
+        xfs_agblock_t           bno;
+        xfs_agblock_t           new;
+        xfs_agblock_t           sdiff;
+        int                     error;
+        int                     i;
+        /* The good extent is perfect, no need to search. */
+        if (!gdiff)
+                goto out_use_good;
+        /*
+         * Look until we find a better one, run out of space or run off the end.
+         */
+        do {
+                error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
+                                          args->minlen, &bno, slena);
+                /*
+                 * The good extent is closer than this one.
+                 */
+                if (!dir) {
+                        if (bno >= args->agbno + gdiff)
+                                goto out_use_good;
+                } else {
+                        if (bno <= args->agbno - gdiff)
+                                goto out_use_good;
+                }
+                /*
+                 * Same distance, compare length and pick the best.
+                 */
+                if (*slena >= args->minlen) {
+                        args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+                        xfs_alloc_fix_len(args);
+                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                                       args->alignment, *sbno,
+                                                       *slen, &new);
+                        /*
+                         * Choose closer size and invalidate other cursor.
+                         */
+                        if (sdiff < gdiff)
+                                goto out_use_search;
+                        goto out_use_good;
+                }
+                if (!dir)
+                        error = xfs_btree_increment(*scur, 0, &i);
+                else
+                        error = xfs_btree_decrement(*scur, 0, &i);
+                if (error)
+                        goto error0;
+        } while (i);
+out_use_good:
+        xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+        *scur = NULL;
+        return 0;
+out_use_search:
+        xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+        *gcur = NULL;
+        return 0;
+error0:
+        /* caller invalidates cursors */
+        return error;
+}
+/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
                        }
                }
        } while (bno_cur_lt || bno_cur_gt);
        /*
         * Got both cursors still active, need to find better entry.
         */
        if (bno_cur_lt && bno_cur_gt) {
-                /*
-                 * Left side is long enough, look for a right side entry.
-                 */
                if (ltlena >= args->minlen) {
                        /*
-                         * Fix up the length.
+                         * Left side is good, look for a right side entry.
                         */
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                        rlen = args->len;
+                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                        ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
                                args->alignment, ltbno, ltlen, &ltnew);
+                        error = xfs_alloc_find_best_extent(args,
+                                                &bno_cur_lt, &bno_cur_gt,
+                                                ltdiff, &gtbno, &gtlen, &gtlena,
+                                                0 /* search right */);
+                } else {
+                        ASSERT(gtlena >= args->minlen);
                        /*
-                         * Not perfect.
+                         * Right side is good, look for a left side entry.
-                         */
-                        if (ltdiff) {
-                                /*
-                                 * Look until we find a better one, run out of
-                                 * space, or run off the end.
-                                 */
-                                while (bno_cur_lt && bno_cur_gt) {
-                                        if ((error = xfs_alloc_get_rec(
-                                                        bno_cur_gt, &gtbno,
-                                                        &gtlen, &i)))
-                                                goto error0;
-                                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                        xfs_alloc_compute_aligned(gtbno, gtlen,
-                                                args->alignment, args->minlen,
-                                                &gtbnoa, &gtlena);
-                                        /*
-                                         * The left one is clearly better.
-                                         */
-                                        if (gtbnoa >= args->agbno + ltdiff) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_gt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_gt = NULL;
-                                                break;
-                                        }
-                                        /*
-                                         * If we reach a big enough entry,
-                                         * compare the two and pick the best.
-                                         */
-                                        if (gtlena >= args->minlen) {
-                                                args->len =
-                                                        XFS_EXTLEN_MIN(gtlena,
-                                                                args->maxlen);
-                                                xfs_alloc_fix_len(args);
-                                                rlen = args->len;
-                                                gtdiff = xfs_alloc_compute_diff(
-                                                        args->agbno, rlen,
-                                                        args->alignment,
-                                                        gtbno, gtlen, &gtnew);
-                                                /*
-                                                 * Right side is better.
-                                                 */
-                                                if (gtdiff < ltdiff) {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_lt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_lt = NULL;
-                                                }
-                                                /*
-                                                 * Left side is better.
-                                                 */
-                                                else {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_gt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_gt = NULL;
-                                                }
-                                                break;
-                                        }
-                                        /*
-                                         * Fell off the right end.
-                                         */
-                                        if ((error = xfs_btree_increment(
-                                                        bno_cur_gt, 0, &i)))
-                                                goto error0;
-                                        if (!i) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_gt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_gt = NULL;
-                                                break;
-                                        }
-                                }
-                        }
-                        /*
-                         * The left side is perfect, trash the right side.
-                         */
-                        else {
-                                xfs_btree_del_cursor(bno_cur_gt,
-                                                     XFS_BTREE_NOERROR);
-                                bno_cur_gt = NULL;
-                        }
-                }
-                /*
-                 * It's the right side that was found first, look left.
-                 */
-                else {
-                        /*
-                         * Fix up the length.
                         */
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                        rlen = args->len;
+                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                        gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
                                args->alignment, gtbno, gtlen, &gtnew);
-                        /*
-                         * Right side entry isn't perfect.
+                        error = xfs_alloc_find_best_extent(args,
-                         */
+                                                &bno_cur_gt, &bno_cur_lt,
-                        if (gtdiff) {
+                                                gtdiff, &ltbno, &ltlen, &ltlena,
-                                /*
+                                                1 /* search left */);
-                                 * Look until we find a better one, run out of
-                                 * space, or run off the end.
-                                 */
-                                while (bno_cur_lt && bno_cur_gt) {
-                                        if ((error = xfs_alloc_get_rec(
-                                                        bno_cur_lt, &ltbno,
-                                                        &ltlen, &i)))
-                                                goto error0;
-                                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                        xfs_alloc_compute_aligned(ltbno, ltlen,
-                                                args->alignment, args->minlen,
-                                                &ltbnoa, &ltlena);
-                                        /*
-                                         * The right one is clearly better.
-                                         */
-                                        if (ltbnoa <= args->agbno - gtdiff) {
-                                                xfs_btree_del_cursor(
-                                                        bno_cur_lt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_lt = NULL;
-                                                break;
-                                        }
-                                        /*
-                                         * If we reach a big enough entry,
-                                         * compare the two and pick the best.
-                                         */
-                                        if (ltlena >= args->minlen) {
-                                                args->len = XFS_EXTLEN_MIN(
-                                                        ltlena, args->maxlen);
-                                                xfs_alloc_fix_len(args);
-                                                rlen = args->len;
-                                                ltdiff = xfs_alloc_compute_diff(
-                                                        args->agbno, rlen,
-                                                        args->alignment,
-                                                        ltbno, ltlen, &ltnew);
-                                                /*
-                                                 * Left side is better.
-                                                 */
-                                                if (ltdiff < gtdiff) {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_gt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_gt = NULL;
-                                                }
-                                                /*
-                                                 * Right side is better.
-                                                 */
-                                                else {
-                                                        xfs_btree_del_cursor(
-                                                                bno_cur_lt,
-                                                                XFS_BTREE_NOERROR);
-                                                        bno_cur_lt = NULL;
-                                                }
-                                                break;
-                                        }
-                                        /*
-                                         * Fell off the left end.
-                                         */
-                                        if ((error = xfs_btree_decrement(
-                                                        bno_cur_lt, 0, &i)))
-                                                goto error0;
-                                        if (!i) {
-                                                xfs_btree_del_cursor(bno_cur_lt,
-                                                        XFS_BTREE_NOERROR);
-                                                bno_cur_lt = NULL;
-                                                break;
-                                        }
-                                }
-                        }
-                        /*
-                         * The right side is perfect, trash the left side.
-                         */
-                        else {
-                                xfs_btree_del_cursor(bno_cur_lt,
-                                        XFS_BTREE_NOERROR);
-                                bno_cur_lt = NULL;
-                        }
                }
+                if (error)
+                        goto error0;
        }
        /*
         * If we couldn't get anything, give up.
         */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
                args->agbno = NULLAGBLOCK;
                return 0;
        }
        /*
         * At this point we have selected a freespace entry, either to the
         * left or to the right.  If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
                j = 1;
        } else
                j = 0;
        /*
         * Fix up the length and compute the useful address.
         */
@@ -2676,7 +2611,7 @@ restart:
 * will require a synchronous transaction, but it can still be
 * used to distinguish between a partial or exact match.
 */
-static int
+int
 xfs_alloc_busy_search(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a9727..d0b3bc72005 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
 #define __XFS_ALLOC_H__
 struct xfs_buf;
+struct xfs_btree_cur;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
 #define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
 /*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *      - the AG superblock, AGF, AGI and AGFL
+ *      - the AGF (bno and cnt) and AGI btree root blocks
+ *      - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)     \
+        ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+/*
 * Argument structure for xfs_alloc routines.
 * This is turned into a structure to avoid having 20 arguments passed
 * down several levels of the stack.
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
                struct xfs_perag *pag);
 #ifdef __KERNEL__
 void
-xfs_alloc_busy_insert(xfs_trans_t *tp,
+xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
-                xfs_agnumber_t agno,
+        xfs_agblock_t bno, xfs_extlen_t len);
-                xfs_agblock_t bno,
-                xfs_extlen_t len);
 void
 xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+        xfs_agblock_t bno, xfs_extlen_t len);
 #endif  /* __KERNEL__ */
 /*
@@ -205,4 +222,18 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
+int                                     /* error */
+xfs_alloc_lookup_le(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat); /* success/failure */
+int                                     /* error */
+xfs_alloc_get_rec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           *bno,   /* output: starting block of extent */
+        xfs_extlen_t            *len,   /* output: length of extent */
+        int                     *stat); /* output: success/failure */
 #endif  /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb..71e90dc2aeb 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
         * It didn't all fit, so we have to sort everything on hashval.
         */
        sbsize = sf->hdr.count * sizeof(*sbuf);
-        sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+        sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
        /*
         * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
                                args.dp = context->dp;
                                args.whichfork = XFS_ATTR_FORK;
                                args.valuelen = valuelen;
-                                args.value = kmem_alloc(valuelen, KM_SLEEP);
+                                args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
                                args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
                                args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
                                retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..dc3afd7739f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
                 * Filling in the middle part of a previous delayed allocation.
                 * Contiguity is impossible here.
                 * This case is avoided almost all the time.
+                 *
+                 * We start with a delayed allocation:
+                 *
+                 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+                 *  PREV @ idx
+                 *
+                 * and we are allocating:
+                 *                     +rrrrrrrrrrrrrrrrr+
+                 *                            new
+                 *
+                 * and we set it up for insertion as:
+                 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+                 *                            new
+                 *  PREV @ idx          LEFT              RIGHT
+                 *                      inserted at idx + 1
                 */
                temp = new->br_startoff - PREV.br_startoff;
-                trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
-                xfs_bmbt_set_blockcount(ep, temp);
-                r[0] = *new;
-                r[1].br_state = PREV.br_state;
-                r[1].br_startblock = 0;
-                r[1].br_startoff = new_endoff;
                temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-                r[1].br_blockcount = temp2;
+                trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
-                xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
+                xfs_bmbt_set_blockcount(ep, temp);      /* truncate PREV */
+                LEFT = *new;
+                RIGHT.br_state = PREV.br_state;
+                RIGHT.br_startblock = nullstartblock(
+                                (int)xfs_bmap_worst_indlen(ip, temp2));
+                RIGHT.br_startoff = new_endoff;
+                RIGHT.br_blockcount = temp2;
+                /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+                xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
                ip->i_df.if_lastex = idx + 1;
                ip->i_d.di_nextents++;
                if (cur == NULL)
@@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
                startag = ag = 0;
        pag = xfs_perag_get(mp, ag);
-        while (*blen < ap->alen) {
+        while (*blen < args->maxlen) {
                if (!pag->pagf_init) {
                        error = xfs_alloc_pagf_init(mp, args->tp, ag,
                                                    XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
                        notinit = 1;
                if (xfs_inode_is_filestream(ap->ip)) {
-                        if (*blen >= ap->alen)
+                        if (*blen >= args->maxlen)
                                break;
                        if (ap->userdata) {
@@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
         * If the best seen length is less than the request
         * length, use the best as the minimum.
         */
-        else if (*blen < ap->alen)
+        else if (*blen < args->maxlen)
                args->minlen = *blen;
        /*
-         * Otherwise we've seen an extent as big as alen,
+         * Otherwise we've seen an extent as big as maxlen,
         * use that as the minimum.
         */
        else
-                args->minlen = ap->alen;
+                args->minlen = args->maxlen;
        /*
         * set the failure fallback case to look in the selected
@@ -2573,7 +2590,9 @@ xfs_bmap_btalloc(
        args.tp = ap->tp;
        args.mp = mp;
        args.fsbno = ap->rval;
-        args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+        /* Trim the allocation back to the maximum an AG can fit. */
+        args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
        args.firstblock = ap->firstblock;
        blen = 0;
        if (nullfb) {
@@ -2621,7 +2640,7 @@ xfs_bmap_btalloc(
                        /*
                         * Adjust for alignment
                         */
-                        if (blen > args.alignment && blen <= ap->alen)
+                        if (blen > args.alignment && blen <= args.maxlen)
                                args.minlen = blen - args.alignment;
                        args.minalignslop = 0;
                } else {
@@ -2640,7 +2659,7 @@ xfs_bmap_btalloc(
                         * of minlen+alignment+slop doesn't go up
                         * between the calls.
                         */
-                        if (blen > mp->m_dalign && blen <= ap->alen)
+                        if (blen > mp->m_dalign && blen <= args.maxlen)
                                nextminlen = blen - mp->m_dalign;
                        else
                                nextminlen = args.minlen;
@@ -4485,6 +4504,16 @@ xfs_bmapi(
                                /* Figure out the extent size, adjust alen */
                                extsz = xfs_get_extsz_hint(ip);
                                if (extsz) {
+                                        /*
+                                         * make sure we don't exceed a single
+                                         * extent length when we align the
+                                         * extent by reducing length we are
+                                         * going to allocate by the maximum
+                                         * amount extent size alignment may
+                                         * require.
+                                         */
+                                        alen = XFS_FILBLKS_MIN(len,
+                                                   MAXEXTLEN - (2 * extsz - 1));
                                        error = xfs_bmap_extsize_align(mp,
                                                        &got, &prev, extsz,
                                                        rt, eof,
@@ -5471,8 +5500,13 @@ xfs_getbmap(
                        if (error)
                                goto out_unlock_iolock;
                }
+                /*
-                ASSERT(ip->i_delayed_blks == 0);
+                 * even after flushing the inode, there can still be delalloc
+                 * blocks on the inode beyond EOF due to speculative
+                 * preallocation. These are not removed until the release
+                 * function is called or the inode is inactivated. Hence we
+                 * cannot assert here that ip->i_delayed_blks == 0.
+                 */
        }
        lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6104,79 @@ xfs_bmap_disk_count_leaves(
                *count += xfs_bmbt_disk_get_blockcount(frp);
        }
 }
+/*
+ * Dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode        *ip,
+        xfs_fileoff_t           start_fsb,
+        xfs_fileoff_t           length)
+{
+        xfs_fileoff_t           remaining = length;
+        int                     error = 0;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        do {
+                int             done;
+                xfs_bmbt_irec_t imap;
+                int             nimaps = 1;
+                xfs_fsblock_t   firstblock;
+                xfs_bmap_free_t flist;
+                /*
+                 * Map the range first and check that it is a delalloc extent
+                 * before trying to unmap the range. Otherwise we will be
+                 * trying to remove a real extent (which requires a
+                 * transaction) or a hole, which is probably a bad idea...
+                 */
+                error = xfs_bmapi(NULL, ip, start_fsb, 1,
+                                XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
+                                &nimaps, NULL);
+                if (error) {
+                        /* the mapping lookup failed, just bail out */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                                                ip->i_ino, start_fsb);
+                        }
+                        break;
+                }
+                if (!nimaps) {
+                        /* nothing mapped at this block, move on */
+                        goto next_block;
+                }
+                if (imap.br_startblock != DELAYSTARTBLOCK) {
+                        /* no longer delalloc (been converted), ignore */
+                        goto next_block;
+                }
+                WARN_ON(imap.br_blockcount == 0);
+                /*
+                 * Note: while we initialise the firstblock/flist pair, they
+                 * should never be used because blocks should never be
+                 * allocated or freed for a delalloc extent and hence we don't
+                 * need to cancel or finish them after the xfs_bunmapi() call.
+                 */
+                xfs_bmap_init(&flist, &firstblock);
+                error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                                        &flist, &done);
+                if (error)
+                        break;
+                ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+                start_fsb++;
+                remaining--;
+        } while(remaining > 0);
+        return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
        int                     whichfork,
        int                     *count);
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode        *ip,
+        xfs_fileoff_t           start_fsb,
+        xfs_fileoff_t           length);
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7..2f9e97c128a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
                return error;
        }
        ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-        if (bp != NULL) {
+        if (bp)
                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-        }
        *bpp = bp;
        return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
        switch (cur->bc_btnum) {
        case XFS_BTNUM_BNO:
        case XFS_BTNUM_CNT:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
                break;
        case XFS_BTNUM_INO:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
                break;
        case XFS_BTNUM_BMAP:
-                XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
                break;
        default:
                ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5..6f8c21ce0d6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
 #define         xfs_buf_item_log_check(x)
 #endif
-STATIC void     xfs_buf_error_relse(xfs_buf_t *bp);
+STATIC void     xfs_buf_do_callbacks(struct xfs_buf *bp);
-STATIC void     xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
 /*
 * This returns the number of log iovecs needed to log the
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
                if (remove) {
                        /*
-                         * We have to remove the log item from the transaction
+                         * If we are in a transaction context, we have to
-                         * as we are about to release our reference to the
+                         * remove the log item from the transaction as we are
-                         * buffer.  If we don't, the unlock that occurs later
+                         * about to release our reference to the buffer.  If we
-                         * in xfs_trans_uncommit() will ry to reference the
+                         * don't, the unlock that occurs later in
+                         * xfs_trans_uncommit() will try to reference the
                         * buffer which we no longer have a hold on.
                         */
-                        xfs_trans_del_item(lip);
+                        if (lip->li_desc)
+                                xfs_trans_del_item(lip);
                        /*
                         * Since the transaction no longer refers to the buffer,
@@ -450,7 +451,7 @@ xfs_buf_item_unpin(
                 * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-                        xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
+                        xfs_buf_do_callbacks(bp);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                } else {
@@ -918,15 +919,26 @@ xfs_buf_attach_iodone(
        XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 }
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. It removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
 STATIC void
 xfs_buf_do_callbacks(
-        xfs_buf_t       *bp,
+        struct xfs_buf          *bp)
-        xfs_log_item_t  *lip)
 {
-        xfs_log_item_t  *nlip;
+        struct xfs_log_item     *lip;
-        while (lip != NULL) {
+        while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
-                nlip = lip->li_bio_list;
+                XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
                ASSERT(lip->li_cb != NULL);
                /*
                 * Clear the next pointer so we don't have any
@@ -936,7 +948,6 @@ xfs_buf_do_callbacks(
                 */
                lip->li_bio_list = NULL;
                lip->li_cb(bp, lip);
-                lip = nlip;
        }
 }
@@ -949,128 +960,76 @@ xfs_buf_do_callbacks(
 */
 void
 xfs_buf_iodone_callbacks(
-        xfs_buf_t       *bp)
+        struct xfs_buf          *bp)
 {
-        xfs_log_item_t  *lip;
+        struct xfs_log_item     *lip = bp->b_fspriv;
-        static ulong    lasttime;
+        struct xfs_mount        *mp = lip->li_mountp;
-        static xfs_buftarg_t *lasttarg;
+        static ulong            lasttime;
-        xfs_mount_t     *mp;
+        static xfs_buftarg_t    *lasttarg;
-        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+        if (likely(!XFS_BUF_GETERROR(bp)))
-        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+                goto do_callbacks;
-        if (XFS_BUF_GETERROR(bp) != 0) {
+        /*
-                /*
+         * If we've already decided to shutdown the filesystem because of
-                 * If we've already decided to shutdown the filesystem
+         * I/O errors, there's no point in giving this a retry.
-                 * because of IO errors, there's no point in giving this
+         */
-                 * a retry.
+        if (XFS_FORCED_SHUTDOWN(mp)) {
-                 */
+                XFS_BUF_SUPER_STALE(bp);
-                mp = lip->li_mountp;
+                trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                if (XFS_FORCED_SHUTDOWN(mp)) {
+                goto do_callbacks;
-                        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
+        }
-                        XFS_BUF_SUPER_STALE(bp);
-                        trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                        xfs_buf_do_callbacks(bp, lip);
-                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
-                        XFS_BUF_CLR_IODONE_FUNC(bp);
-                        xfs_buf_ioend(bp, 0);
-                        return;
-                }
-                if ((XFS_BUF_TARGET(bp) != lasttarg) ||
+        if (XFS_BUF_TARGET(bp) != lasttarg ||
-                    (time_after(jiffies, (lasttime + 5*HZ)))) {
+            time_after(jiffies, (lasttime + 5*HZ))) {
-                        lasttime = jiffies;
+                lasttime = jiffies;
-                        cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+                cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-                                        " block 0x%llx in %s",
+                                " block 0x%llx in %s",
-                                XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+                        XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
-                              (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+                      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
-                }
+        }
-                lasttarg = XFS_BUF_TARGET(bp);
+        lasttarg = XFS_BUF_TARGET(bp);
-                if (XFS_BUF_ISASYNC(bp)) {
+        /*
-                        /*
+         * If the write was asynchronous then no one will be looking for the
-                         * If the write was asynchronous then noone will be
+         * error.  Clear the error state and write the buffer out again.
-                         * looking for the error.  Clear the error state
+         *
-                         * and write the buffer out again delayed write.
+         * During sync or umount we'll write all pending buffers again
-                         *
+         * synchronously, which will catch these errors if they keep hanging
-                         * XXXsup This is OK, so long as we catch these
+         * around.
-                         * before we start the umount; we don't want these
+         */
-                         * DELWRI metadata bufs to be hanging around.
+        if (XFS_BUF_ISASYNC(bp)) {
-                         */
+                XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
-                        XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
+                if (!XFS_BUF_ISSTALE(bp)) {
-                        if (!(XFS_BUF_ISSTALE(bp))) {
+                        XFS_BUF_DELAYWRITE(bp);
-                                XFS_BUF_DELAYWRITE(bp);
-                                XFS_BUF_DONE(bp);
-                                XFS_BUF_SET_START(bp);
-                        }
-                        ASSERT(XFS_BUF_IODONE_FUNC(bp));
-                        trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-                        xfs_buf_relse(bp);
-                } else {
-                        /*
-                         * If the write of the buffer was not asynchronous,
-                         * then we want to make sure to return the error
-                         * to the caller of bwrite().  Because of this we
-                         * cannot clear the B_ERROR state at this point.
-                         * Instead we install a callback function that
-                         * will be called when the buffer is released, and
-                         * that routine will clear the error state and
-                         * set the buffer to be written out again after
-                         * some delay.
-                         */
-                        /* We actually overwrite the existing b-relse
-                           function at times, but we're gonna be shutting down
-                           anyway. */
-                        XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
                        XFS_BUF_DONE(bp);
-                        XFS_BUF_FINISH_IOWAIT(bp);
+                        XFS_BUF_SET_START(bp);
                }
+                ASSERT(XFS_BUF_IODONE_FUNC(bp));
+                trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+                xfs_buf_relse(bp);
                return;
        }
-        xfs_buf_do_callbacks(bp, lip);
+        /*
-        XFS_BUF_SET_FSPRIVATE(bp, NULL);
+         * If the write of the buffer was synchronous, we want to make
-        XFS_BUF_CLR_IODONE_FUNC(bp);
+         * sure to return the error to the caller of xfs_bwrite().
-        xfs_buf_ioend(bp, 0);
+         */
-}
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
-        xfs_buf_t       *bp)
-{
-        xfs_log_item_t  *lip;
-        xfs_mount_t     *mp;
-        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-        mp = (xfs_mount_t *)lip->li_mountp;
-        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
        XFS_BUF_STALE(bp);
        XFS_BUF_DONE(bp);
        XFS_BUF_UNDELAYWRITE(bp);
-        XFS_BUF_ERROR(bp,0);
        trace_xfs_buf_error_relse(bp, _RET_IP_);
+        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-        if (! XFS_FORCED_SHUTDOWN(mp))
+do_callbacks:
-                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+        xfs_buf_do_callbacks(bp);
-        /*
-         * We have to unpin the pinned buffers so do the
-         * callbacks.
-         */
-        xfs_buf_do_callbacks(bp, lip);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
-        XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
+        xfs_buf_ioend(bp, 0);
-        xfs_buf_relse(bp);
 }
 /*
 * This is the iodone() function for buffers which have been
 * logged.  It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c..b6ecd2061e7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
        xfs_buf_log_format_t    bli_format;     /* in-log header */
 } xfs_buf_log_item_t;
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
-        xfs_daddr_t             bc_blkno;
-        uint                    bc_len;
-        int                     bc_refcount;
-        struct xfs_buf_cancel   *bc_next;
-} xfs_buf_cancel_t;
 void    xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
 void    xfs_buf_item_relse(struct xfs_buf *);
 void    xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
        ip->i_d.di_format = tip->i_d.di_format;
        tip->i_d.di_format = tmp;
+        /*
+         * The extents in the source inode could still contain speculative
+         * preallocation beyond EOF (e.g. the file is open but not modified
+         * while defrag is in progress). In that case, we need to copy over the
+         * number of delalloc blocks the data fork in the source inode is
+         * tracking beyond EOF so that when the fork is truncated away when the
+         * temporary inode is unlinked we don't underrun the i_delayed_blks
+         * counter on that inode.
+         */
+        ASSERT(tip->i_delayed_blks == 0);
+        tip->i_delayed_blks = ip->i_delayed_blks;
+        ip->i_delayed_blks = 0;
        ilf_fields = XFS_ILOG_CORE;
        switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..4c7db74a05f 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
 int     xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *  xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int     xfs_error_test_active;
 int
 xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
                        len = strlen(mp->m_fsname);
                        xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
                        strcpy(xfs_etest_fsname[i], mp->m_fsname);
+                        xfs_error_test_active++;
                        return 0;
                }
        }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
                        xfs_etest_fsid[i] = 0LL;
                        kmem_free(xfs_etest_fsname[i]);
                        xfs_etest_fsname[i] = NULL;
+                        xfs_error_test_active--;
                }
        }
@@ -149,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 }
 #endif /* DEBUG */
-void
-xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
-{
-        va_list ap;
-        va_start(ap, fmt);
-        xfs_fs_vcmn_err(level, mp, fmt, ap);
-        va_end(ap);
-}
-void
-xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
-{
-        va_list ap;
-#ifdef DEBUG
-        xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
-#endif
-        if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
-            && (level & CE_ALERT)) {
-                level &= ~CE_ALERT;
-                level |= CE_PANIC;
-                cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
-        }
-        va_start(ap, fmt);
-        xfs_fs_vcmn_err(level, mp, fmt, ap);
-        va_end(ap);
-}
 void
 xfs_error_report(
        const char              *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..10dce5475f0 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level,
 #define XFS_RANDOM_BMAPIFORMAT                          XFS_RANDOM_DEFAULT
 #ifdef DEBUG
+extern int xfs_error_test_active;
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 #define XFS_NUM_INJECT_ERROR                            10
 #define XFS_TEST_ERROR(expr, mp, tag, rf)               \
-        ((expr) || \
+        ((expr) || (xfs_error_test_active && \
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
-                        (rf)))
+                        (rf))))
-extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
-extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)       (expr)
 #define xfs_errortag_add(tag, mp)               (ENOSYS)
@@ -161,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 struct xfs_mount;
-extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
-                char *fmt, va_list ap)
-        __attribute__ ((format (printf, 3, 0)));
-extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
-                        char *fmt, ...)
-        __attribute__ ((format (printf, 4, 5)));
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
-        __attribute__ ((format (printf, 3, 4)));
 extern void xfs_hex_dump(void *p, int length);
 #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
        xfs_fs_cmn_err(level, mp, fmt "  Unmount and run xfs_repair.", ## args)
 #define xfs_fs_mount_cmn_err(f, fmt, args...) \
-        ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
+        do { \
+                if (!(f & XFS_MFSI_QUIET))      \
+                        cmn_err(CE_WARN, "XFS: " fmt, ## args); \
+        } while (0)
 #endif  /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf56..d22e6262343 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
 }
 /*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+        struct xfs_efi_log_item *efip)
+{
+        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+        if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+                spin_lock(&ailp->xa_lock);
+                /* xfs_trans_ail_delete() drops the AIL lock. */
+                xfs_trans_ail_delete(ailp, &efip->efi_item);
+                xfs_efi_item_free(efip);
+        }
+}
+/*
 * This returns the number of iovecs needed to log the given efi item.
 * We only need 1 iovec for an efi item.  It just logs the efi_log_format
 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
        uint                    size;
-        ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+        ASSERT(atomic_read(&efip->efi_next_extent) ==
+                                efip->efi_format.efi_nextents);
        efip->efi_format.efi_type = XFS_LI_EFI;
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
 }
 /*
- * While EFIs cannot really be pinned, the unpin operation is the
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
- * last place at which the EFI is manipulated during a transaction.
+ * which the EFI is manipulated during a transaction.  If we are being asked to
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
+ * remove the EFI it's because the transaction has been cancelled and by
- * free the EFI.
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
 */
 STATIC void
 xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
        int                     remove)
 {
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-        struct xfs_ail          *ailp = lip->li_ailp;
-        spin_lock(&ailp->xa_lock);
+        if (remove) {
-        if (efip->efi_flags & XFS_EFI_CANCELED) {
+                ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
-                if (remove)
+                if (lip->li_desc)
                        xfs_trans_del_item(lip);
-                /* xfs_trans_ail_delete() drops the AIL lock. */
-                xfs_trans_ail_delete(ailp, lip);
                xfs_efi_item_free(efip);
-        } else {
+                return;
-                efip->efi_flags |= XFS_EFI_COMMITTED;
-                spin_unlock(&ailp->xa_lock);
        }
+        __xfs_efi_release(efip);
 }
 /*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
 }
 /*
- * The EFI is logged only once and cannot be moved in the log, so
+ * The EFI is logged only once and cannot be moved in the log, so simply return
- * simply return the lsn at which it's been logged.  The canceled
+ * the lsn at which it's been logged.  For bulk transaction committed
- * flag is not paid any attention here.  Checking for that is delayed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
- * until the EFI is unpinned.
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
 */
 STATIC xfs_lsn_t
 xfs_efi_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+        set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
        return lsn;
 }
@@ -230,6 +254,7 @@ xfs_efi_init(
        xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
+        atomic_set(&efip->efi_next_extent, 0);
        return efip;
 }
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 }
 /*
- * This is called by the efd item code below to release references to
+ * This is called by the efd item code below to release references to the given
- * the given efi item.  Each efd calls this with the number of
+ * efi item.  Each efd calls this with the number of extents that it has
- * extents that it has logged, and when the sum of these reaches
+ * logged, and when the sum of these reaches the total number of extents logged
- * the total number of extents logged by this efi item we can free
+ * by this efi item we can free the efi item.
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
 */
 void
 xfs_efi_release(xfs_efi_log_item_t      *efip,
                uint                    nextents)
 {
-        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+        ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
-        int                     extents_left;
+        if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+                __xfs_efi_release(efip);
-        ASSERT(efip->efi_next_extent > 0);
-        ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-        spin_lock(&ailp->xa_lock);
-        ASSERT(efip->efi_next_extent >= nextents);
-        efip->efi_next_extent -= nextents;
-        extents_left = efip->efi_next_extent;
-        if (extents_left == 0) {
-                /* xfs_trans_ail_delete() drops the AIL lock. */
-                xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-                xfs_efi_item_free(efip);
-        } else {
-                spin_unlock(&ailp->xa_lock);
-        }
 }
 static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf6..375f68e4253 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
 #define XFS_EFI_MAX_FAST_EXTENTS        16
 /*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
 */
-#define XFS_EFI_RECOVERED       0x1
+#define XFS_EFI_RECOVERED       1
-#define XFS_EFI_COMMITTED       0x2
+#define XFS_EFI_COMMITTED       2
-#define XFS_EFI_CANCELED        0x4
 /*
 * This is the "extent free intention" log item.  It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
 */
 typedef struct xfs_efi_log_item {
        xfs_log_item_t          efi_item;
-        uint                    efi_flags;      /* misc flags */
+        atomic_t                efi_next_extent;
-        uint                    efi_next_extent;
+        unsigned long           efi_flags;      /* misc flags */
        xfs_efi_log_format_t    efi_format;
 } xfs_efi_log_item_t;
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce569..9124425b7f2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
         * If the file's parent directory is known, take its iolock in exclusive
         * mode to prevent two sibling files from racing each other to migrate
         * themselves and their parent to different AGs.
+         *
+         * Note that we lock the parent directory iolock inside the child
+         * iolock here.  That's fine as we never hold both parent and child
+         * iolock in any other place.  This is different from the ilock,
+         * which requires locking of the child after the parent for namespace
+         * operations.
         */
        if (pip)
-                xfs_ilock(pip, XFS_IOLOCK_EXCL);
+                xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        /*
         * A new AG needs to be found for the file.  If the file's parent
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814a..cec89dd5d7d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
                mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
        } else
                mp->m_maxicount = 0;
+        xfs_set_low_space_thresholds(mp);
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
@@ -611,12 +612,13 @@ out:
 *
 * We cannot use an inode here for this - that will push dirty state back up
 * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead.
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
 */
 int
 xfs_fs_log_dummy(
-        xfs_mount_t     *mp,
+        xfs_mount_t     *mp)
-        int             flags)
 {
        xfs_trans_t     *tp;
        int             error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
        /* log the UUID because it is an unchanging field */
        xfs_mod_sb(tp, XFS_SB_UUID);
-        if (flags & SYNC_WAIT)
+        xfs_trans_set_sync(tp);
-                xfs_trans_set_sync(tp);
        return xfs_trans_commit(tp, 0);
 }
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1..1b6a98b6688 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
 #endif  /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8..cb9b6d1469f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
 /*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation sites. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+/*
 * Allocate and initialise an xfs_inode.
 */
 STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+        ASSERT(ip->i_ino == 0);
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+        lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                        &xfs_iolock_active, "xfs_iolock_active");
        /* initialise the xfs inode */
        ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
        ip->i_size = 0;
        ip->i_new_size = 0;
-        /* prevent anyone from using this yet */
-        VFS_I(ip)->i_state = I_NEW;
        return ip;
 }
+STATIC void
+xfs_inode_free_callback(
+        struct rcu_head         *head)
+{
+        struct inode            *inode = container_of(head, struct inode, i_rcu);
+        struct xfs_inode        *ip = XFS_I(inode);
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_zone_free(xfs_inode_zone, ip);
+}
 void
 xfs_inode_free(
        struct xfs_inode        *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
-        kmem_zone_free(xfs_inode_zone, ip);
+        /*
+         * Because we use RCU freeing we need to ensure the inode always
+         * appears to be reclaimed with an invalid inode number when in the
+         * free state. The ip->i_flags_lock provides the barrier against lookup
+         * races.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ip->i_flags = XFS_IRECLAIM;
+        ip->i_ino = 0;
+        spin_unlock(&ip->i_flags_lock);
+        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
 /*
@@ -144,14 +177,29 @@ static int
 xfs_iget_cache_hit(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip,
+        xfs_ino_t               ino,
        int                     flags,
-        int                     lock_flags) __releases(pag->pag_ici_lock)
+        int                     lock_flags) __releases(RCU)
 {
        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;
+        /*
+         * check for re-use of an inode within an RCU grace period due to the
+         * radix tree nodes not being updated yet. We monitor for this by
+         * setting the inode number to zero before freeing the inode structure.
+         * If the inode has been reallocated and set up, then the inode number
+         * will not match, so check for that, too.
+         */
        spin_lock(&ip->i_flags_lock);
+        if (ip->i_ino != ino) {
+                trace_xfs_iget_skip(ip);
+                XFS_STATS_INC(xs_ig_frecycle);
+                error = EAGAIN;
+                goto out_error;
+        }
        /*
         * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
                ip->i_flags |= XFS_IRECLAIM;
                spin_unlock(&ip->i_flags_lock);
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                error = -inode_init_always(mp->m_super, inode);
                if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
                         * Re-initializing the inode failed, and we are in deep
                         * trouble.  Try to re-add it to the reclaim list.
                         */
-                        read_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        spin_lock(&ip->i_flags_lock);
                        ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
                        goto out_error;
                }
-                write_lock(&pag->pag_ici_lock);
+                spin_lock(&pag->pag_ici_lock);
                spin_lock(&ip->i_flags_lock);
                ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
                ip->i_flags |= XFS_INEW;
                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
                inode->i_state = I_NEW;
+                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+                mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+                lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                                &xfs_iolock_active, "xfs_iolock_active");
                spin_unlock(&ip->i_flags_lock);
-                write_unlock(&pag->pag_ici_lock);
+                spin_unlock(&pag->pag_ici_lock);
        } else {
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                trace_xfs_iget_hit(ip);
        }
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
 out_error:
        spin_unlock(&ip->i_flags_lock);
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        return error;
 }
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
                        BUG();
        }
-        write_lock(&pag->pag_ici_lock);
+        spin_lock(&pag->pag_ici_lock);
        /* insert the new inode */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
        *ipp = ip;
        return 0;
 out_preload_end:
-        write_unlock(&pag->pag_ici_lock);
+        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
@@ -366,7 +420,7 @@ xfs_iget(
        xfs_agino_t     agino;
        /* reject inode numbers outside existing AGs */
-        if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
        /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
 again:
        error = 0;
-        read_lock(&pag->pag_ici_lock);
+        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
        if (ip) {
-                error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                if (error)
                        goto out_error_or_again;
        } else {
-                read_unlock(&pag->pag_ici_lock);
+                rcu_read_unlock();
                XFS_STATS_INC(xs_ig_missed);
                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f9..be7cf625421 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
-        XFS_BUF_SET_REF(bp, XFS_INO_REF);
+        xfs_buf_set_ref(bp, XFS_INO_REF);
        /*
         * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                        read_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
-                        /* Inode not in memory or stale, nothing to do */
+                        /* Inode not in memory, nothing to do */
-                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
+                        if (!ip) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                continue;
                        }
                        /*
+                         * because this is an RCU protected lookup, we could
+                         * find a recently freed or even reallocated inode
+                         * during the lookup. We need to check under the
+                         * i_flags_lock for a valid inode here. Skip it if it
+                         * is not valid, the wrong inode or stale.
+                         */
+                        spin_lock(&ip->i_flags_lock);
+                        if (ip->i_ino != inum + i ||
+                            __xfs_iflags_test(ip, XFS_ISTALE)) {
+                                spin_unlock(&ip->i_flags_lock);
+                                rcu_read_unlock();
+                                continue;
+                        }
+                        spin_unlock(&ip->i_flags_lock);
+                        /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
                         * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                        read_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-        read_lock(&pag->pag_ici_lock);
+        rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                iq = ilist[i];
                if (iq == ip)
                        continue;
-                /* if the inode lies outside this cluster, we're done. */
-                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+                /*
-                        break;
+                 * because this is an RCU protected lookup, we could find a
+                 * recently freed or even reallocated inode during the lookup.
+                 * We need to check under the i_flags_lock for a valid inode
+                 * here. Skip it if it is not valid or the wrong inode.
+                 */
+                spin_lock(&ip->i_flags_lock);
+                if (!ip->i_ino ||
+                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                        spin_unlock(&ip->i_flags_lock);
+                        continue;
+                }
+                spin_unlock(&ip->i_flags_lock);
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
        }
 out_free:
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc..5c95fa8ec11 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
 * In-core inode flags.
 */
-#define XFS_IRECLAIM    0x0001  /* we have started reclaiming this inode    */
+#define XFS_IRECLAIM            0x0001  /* started reclaiming this inode */
-#define XFS_ISTALE      0x0002  /* inode has been staled */
+#define XFS_ISTALE              0x0002  /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_IRECLAIMABLE        0x0004  /* inode can be reclaimed */
-#define XFS_INEW        0x0008  /* inode has just been allocated */
+#define XFS_INEW                0x0008  /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010  /* inode is in a filestream directory */
+#define XFS_IFILESTREAM         0x0010  /* inode is in a filestream directory */
-#define XFS_ITRUNCATED  0x0020  /* truncated down so flush-on-close */
+#define XFS_ITRUNCATED          0x0020  /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE      0x0040  /* dirty release already seen */
 /*
 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_IOLOCK_DEP(flags)   (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
 #define XFS_ILOCK_DEP(flags)    (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+extern struct lock_class_key xfs_iolock_reclaimable;
 /*
 * Flags for xfs_itruncate_start().
 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..fd4f398bd6f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
 }
 /*
- * This is called to find out where the oldest active copy of the
+ * This is called to find out where the oldest active copy of the inode log
- * inode log item in the on disk log resides now that the last log
+ * item in the on disk log resides now that the last log write of it completed
- * write of it completed at the given lsn.  Since we always re-log
+ * at the given lsn.  Since we always re-log all dirty data in an inode, the
- * all dirty data in an inode, the latest copy in the on disk log
+ * latest copy in the on disk log is the only one that matters.  Therefore,
- * is the only one that matters.  Therefore, simply return the
+ * simply return the given lsn.
- * given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO to complete before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freeing an inode
+ * still in the AIL.
+ *
+ * To avoid this, return a lower LSN than the one passed in so that the
+ * transaction committed code will not move the inode forward in the AIL but
+ * will still unpin it properly.
 */
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode        *ip = iip->ili_inode;
+        if (xfs_iflags_test(ip, XFS_ISTALE))
+                return lsn - 1;
        return lsn;
 }
@@ -823,15 +842,64 @@ xfs_inode_item_destroy(
 * flushed to disk.  It is responsible for removing the inode item
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
 */
 void
 xfs_iflush_done(
        struct xfs_buf          *bp,
        struct xfs_log_item     *lip)
 {
-        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode_log_item *iip;
-        xfs_inode_t             *ip = iip->ili_inode;
+        struct xfs_log_item     *blip;
+        struct xfs_log_item     *next;
+        struct xfs_log_item     *prev;
        struct xfs_ail          *ailp = lip->li_ailp;
+        int                     need_ail = 0;
+        /*
+         * Scan the buffer IO completions for other inodes being completed and
+         * attach them to the current inode log item.
+         */
+        blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+        prev = NULL;
+        while (blip != NULL) {
+                if (lip->li_cb != xfs_iflush_done) {
+                        prev = blip;
+                        blip = blip->li_bio_list;
+                        continue;
+                }
+                /* remove from list */
+                next = blip->li_bio_list;
+                if (!prev) {
+                        XFS_BUF_SET_FSPRIVATE(bp, next);
+                } else {
+                        prev->li_bio_list = next;
+                }
+                /* add to current list */
+                blip->li_bio_list = lip->li_bio_list;
+                lip->li_bio_list = blip;
+                /*
+                 * while we have the item, do the unlocked check for needing
+                 * the AIL lock.
+                 */
+                iip = INODE_ITEM(blip);
+                if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+                        need_ail++;
+                blip = next;
+        }
+        /* make sure we capture the state of the initial inode. */
+        iip = INODE_ITEM(lip);
+        if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+                need_ail++;
        /*
         * We only want to pull the item from the AIL if it is
@@ -842,28 +910,37 @@ xfs_iflush_done(
         * the lock since it's cheaper, and then we recheck while
         * holding the lock before removing the inode from the AIL.
         */
-        if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
+        if (need_ail) {
+                struct xfs_log_item *log_items[need_ail];
+                int i = 0;
                spin_lock(&ailp->xa_lock);
-                if (lip->li_lsn == iip->ili_flush_lsn) {
+                for (blip = lip; blip; blip = blip->li_bio_list) {
-                        /* xfs_trans_ail_delete() drops the AIL lock. */
+                        iip = INODE_ITEM(blip);
-                        xfs_trans_ail_delete(ailp, lip);
+                        if (iip->ili_logged &&
-                } else {
+                            blip->li_lsn == iip->ili_flush_lsn) {
-                        spin_unlock(&ailp->xa_lock);
+                                log_items[i++] = blip;
+                        }
+                        ASSERT(i <= need_ail);
                }
+                /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+                xfs_trans_ail_delete_bulk(ailp, log_items, i);
        }
-        iip->ili_logged = 0;
        /*
-         * Clear the ili_last_fields bits now that we know that the
+         * clean up and unlock the flush lock now we are done. We can clear the
-         * data corresponding to them is safely on disk.
+         * ili_last_fields bits now that we know that the data corresponding to
+         * them is safely on disk.
         */
-        iip->ili_last_fields = 0;
+        for (blip = lip; blip; blip = next) {
+                next = blip->li_bio_list;
+                blip->li_bio_list = NULL;
-        /*
+                iip = INODE_ITEM(blip);
-         * Release the inode's flush lock since we're done with it.
+                iip->ili_logged = 0;
-         */
+                iip->ili_last_fields = 0;
-        xfs_ifunlock(ip);
+                xfs_ifunlock(iip->ili_inode);
+        }
 }
 /*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369..8a0f044750c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
 #define XFS_WRITEIO_ALIGN(mp,off)       (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS   2
 #define XFS_WRITE_IMAPS         XFS_BMAP_MAX_NMAP
-STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-                                  int, struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-                                 struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-                                struct xfs_bmbt_irec *, int *);
-int
-xfs_iomap(
-        struct xfs_inode        *ip,
-        xfs_off_t               offset,
-        ssize_t                 count,
-        int                     flags,
-        struct xfs_bmbt_irec    *imap,
-        int                     *nimaps,
-        int                     *new)
-{
-        struct xfs_mount        *mp = ip->i_mount;
-        xfs_fileoff_t           offset_fsb, end_fsb;
-        int                     error = 0;
-        int                     lockmode = 0;
-        int                     bmapi_flags = 0;
-        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-        *new = 0;
-        if (XFS_FORCED_SHUTDOWN(mp))
-                return XFS_ERROR(EIO);
-        trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
-        switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
-        case BMAPI_READ:
-                lockmode = xfs_ilock_map_shared(ip);
-                bmapi_flags = XFS_BMAPI_ENTIRE;
-                break;
-        case BMAPI_WRITE:
-                lockmode = XFS_ILOCK_EXCL;
-                if (flags & BMAPI_IGNSTATE)
-                        bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
-                xfs_ilock(ip, lockmode);
-                break;
-        case BMAPI_ALLOCATE:
-                lockmode = XFS_ILOCK_SHARED;
-                bmapi_flags = XFS_BMAPI_ENTIRE;
-                /* Attempt non-blocking lock */
-                if (flags & BMAPI_TRYLOCK) {
-                        if (!xfs_ilock_nowait(ip, lockmode))
-                                return XFS_ERROR(EAGAIN);
-                } else {
-                        xfs_ilock(ip, lockmode);
-                }
-                break;
-        default:
-                BUG();
-        }
-        ASSERT(offset <= mp->m_maxioffset);
-        if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-                count = mp->m_maxioffset - offset;
-        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-        error = xfs_bmapi(NULL, ip, offset_fsb,
-                        (xfs_filblks_t)(end_fsb - offset_fsb),
-                        bmapi_flags,  NULL, 0, imap,
-                        nimaps, NULL);
-        if (error)
-                goto out;
-        switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
-        case BMAPI_WRITE:
-                /* If we found an extent, return it */
-                if (*nimaps &&
-                    (imap->br_startblock != HOLESTARTBLOCK) &&
-                    (imap->br_startblock != DELAYSTARTBLOCK)) {
-                        trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                        break;
-                }
-                if (flags & BMAPI_DIRECT) {
-                        error = xfs_iomap_write_direct(ip, offset, count, flags,
-                                                       imap, nimaps);
-                } else {
-                        error = xfs_iomap_write_delay(ip, offset, count, flags,
-                                                      imap, nimaps);
-                }
-                if (!error) {
-                        trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
-                }
-                *new = 1;
-                break;
-        case BMAPI_ALLOCATE:
-                /* If we found an extent, return it */
-                xfs_iunlock(ip, lockmode);
-                lockmode = 0;
-                if (*nimaps && !isnullstartblock(imap->br_startblock)) {
-                        trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                        break;
-                }
-                error = xfs_iomap_write_allocate(ip, offset, count,
-                                                 imap, nimaps);
-                break;
-        }
-        ASSERT(*nimaps <= 1);
-out:
-        if (lockmode)
-                xfs_iunlock(ip, lockmode);
-        return XFS_ERROR(error);
-}
 STATIC int
 xfs_iomap_eof_align_last_fsb(
        xfs_mount_t     *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
        return EFSCORRUPTED;
 }
-STATIC int
+int
 xfs_iomap_write_direct(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             flags,
        xfs_bmbt_irec_t *imap,
-        int             *nmaps)
+        int             nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
                if (error)
                        goto error_out;
        } else {
-                if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+                if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                        last_fsb = MIN(last_fsb, (xfs_fileoff_t)
                                        imap->br_blockcount +
                                        imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
        xfs_trans_ijoin(tp, ip);
        bmapi_flag = XFS_BMAPI_WRITE;
-        if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
+        if (offset < ip->i_size || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
        /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
                goto error_out;
        }
-        *nmaps = 1;
        return 0;
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-        *nmaps = 0;     /* nothing set-up here */
 error_out:
        return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
 * If the caller is doing a write at the end of the file, then extend the
 * allocation out to the file system's write iosize.  We clean up any extra
 * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
 */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             ioflag,
        xfs_bmbt_irec_t *imap,
        int             nimaps,
        int             *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
        xfs_filblks_t   count_fsb;
        xfs_fsblock_t   firstblock;
        int             n, error, imaps;
+        int             found_delalloc = 0;
        *prealloc = 0;
        if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
                                return 0;
                        start_fsb += imap[n].br_blockcount;
                        count_fsb -= imap[n].br_blockcount;
+                        if (imap[n].br_startblock == DELAYSTARTBLOCK)
+                                found_delalloc = 1;
                }
        }
-        *prealloc = 1;
+        if (!found_delalloc)
+                *prealloc = 1;
        return 0;
 }
-STATIC int
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum preallocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *ip)
+{
+        xfs_fsblock_t           alloc_blocks = 0;
+        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+                int shift = 0;
+                int64_t freesp;
+                /*
+                 * rounddown_pow_of_two() returns an undefined result
+                 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
+                 * ensure we always pass in a non-zero value.
+                 */
+                alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+                                        rounddown_pow_of_two(alloc_blocks));
+                xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+                freesp = mp->m_sb.sb_fdblocks;
+                if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                        shift = 2;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                                shift++;
+                        if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                                shift++;
+                }
+                if (shift)
+                        alloc_blocks >>= shift;
+        }
+        if (alloc_blocks < mp->m_writeio_blocks)
+                alloc_blocks = mp->m_writeio_blocks;
+        return alloc_blocks;
+}
+int
 xfs_iomap_write_delay(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        int             ioflag,
+        xfs_bmbt_irec_t *ret_imap)
-        xfs_bmbt_irec_t *ret_imap,
-        int             *nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-                                ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
+                                imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
                return error;
 retry:
        if (prealloc) {
+                xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-                last_fsb = ioalign + mp->m_writeio_blocks;
+                last_fsb = ioalign + alloc_blocks;
        } else {
                last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
        }
@@ -496,22 +431,31 @@ retry:
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
                          &nimaps, NULL);
-        if (error && (error != ENOSPC))
+        switch (error) {
+        case 0:
+        case ENOSPC:
+        case EDQUOT:
+                break;
+        default:
                return XFS_ERROR(error);
+        }
        /*
-         * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+         * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-         * then we must have run out of space - flush all other inodes with
+         * ENOSPC, flush all other inodes with delalloc blocks to free up
-         * delalloc blocks and retry without EOF preallocation.
+         * some of the excess reserved metadata space. For both cases, retry
+         * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (flushed)
-                        return XFS_ERROR(ENOSPC);
+                        return XFS_ERROR(error ? error : ENOSPC);
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                if (error == ENOSPC) {
-                xfs_flush_inodes(ip);
+                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                        xfs_flush_inodes(ip);
+                        xfs_ilock(ip, XFS_ILOCK_EXCL);
+                }
                flushed = 1;
                error = 0;
@@ -523,8 +467,6 @@ retry:
                return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
        *ret_imap = imap[0];
-        *nmaps = 1;
        return 0;
 }
@@ -538,13 +480,12 @@ retry:
 * We no longer bother to look at the incoming map - all we have to
 * guarantee is that whatever we allocate fills the required range.
 */
-STATIC int
+int
 xfs_iomap_write_allocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        xfs_bmbt_irec_t *imap,
+        xfs_bmbt_irec_t *imap)
-        int             *retmap)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
        int             error = 0;
        int             nres;
-        *retmap = 0;
        /*
         * Make sure that the dquots are there.
         */
@@ -680,7 +619,6 @@ xfs_iomap_write_allocate(
                if ((offset_fsb >= imap->br_startoff) &&
                    (offset_fsb < (imap->br_startoff +
                                   imap->br_blockcount))) {
-                        *retmap = 1;
                        XFS_STATS_INC(xs_xstrat_quick);
                        return 0;
                }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50..80615760959 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
-/* base extent manipulation calls */
-#define BMAPI_READ      (1 << 0)        /* read extents */
-#define BMAPI_WRITE     (1 << 1)        /* create extents */
-#define BMAPI_ALLOCATE  (1 << 2)        /* delayed allocate to real extents */
-/* modifiers */
-#define BMAPI_IGNSTATE  (1 << 4)        /* ignore unwritten state on read */
-#define BMAPI_DIRECT    (1 << 5)        /* direct instead of buffered write */
-#define BMAPI_MMA       (1 << 6)        /* allocate for mmap write */
-#define BMAPI_TRYLOCK   (1 << 7)        /* non-blocking request */
-#define BMAPI_FLAGS \
-        { BMAPI_READ,           "READ" }, \
-        { BMAPI_WRITE,          "WRITE" }, \
-        { BMAPI_ALLOCATE,       "ALLOCATE" }, \
-        { BMAPI_IGNSTATE,       "IGNSTATE" }, \
-        { BMAPI_DIRECT,         "DIRECT" }, \
-        { BMAPI_TRYLOCK,        "TRYLOCK" }
 struct xfs_inode;
 struct xfs_bmbt_irec;
-extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-                     struct xfs_bmbt_irec *, int *, int *);
+                        struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+                        struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+                        struct xfs_bmbt_irec *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9..ae6fef1ff56 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
                                xfs_buftarg_t   *log_target,
                                xfs_daddr_t     blk_offset,
                                int             num_bblks);
-STATIC int       xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int       xlog_space_left(struct log *log, atomic64_t *head);
 STATIC int       xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void      xlog_dealloc_log(xlog_t *log);
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t	*log, xlog_in_core_t *iclog);
 /* local functions to manipulate grant head */
 STATIC int  xlog_grant_log_space(xlog_t         *log,
                                 xlog_ticket_t  *xtic);
-STATIC void xlog_grant_push_ail(xfs_mount_t     *mp,
+STATIC void xlog_grant_push_ail(struct log      *log,
                                int             need_bytes);
 STATIC void xlog_regrant_reserve_log_space(xlog_t        *log,
                                           xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 #if defined(DEBUG)
 STATIC void     xlog_verify_dest_ptr(xlog_t *log, char *ptr);
-STATIC void     xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void     xlog_verify_grant_tail(struct log *log);
 STATIC void     xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
                                  int count, boolean_t syncing);
 STATIC void     xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
                                     xfs_lsn_t tail_lsn);
 #else
 #define xlog_verify_dest_ptr(a,b)
-#define xlog_verify_grant_head(a,b)
+#define xlog_verify_grant_tail(a)
 #define xlog_verify_iclog(a,b,c,d)
 #define xlog_verify_tail_lsn(a,b,c)
 #endif
 STATIC int      xlog_iclogs_empty(xlog_t *log);
 static void
-xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
+xlog_grant_sub_space(
+        struct log      *log,
+        atomic64_t      *head,
+        int             bytes)
 {
-        if (*qp) {
+        int64_t head_val = atomic64_read(head);
-                tic->t_next         = (*qp);
+        int64_t new, old;
-                tic->t_prev         = (*qp)->t_prev;
-                (*qp)->t_prev->t_next = tic;
-                (*qp)->t_prev       = tic;
-        } else {
-                tic->t_prev = tic->t_next = tic;
-                *qp = tic;
-        }
-        tic->t_flags |= XLOG_TIC_IN_Q;
+        do {
-}
+                int     cycle, space;
-static void
+                xlog_crack_grant_head_val(head_val, &cycle, &space);
-xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
-{
-        if (tic == tic->t_next) {
-                *qp = NULL;
-        } else {
-                *qp = tic->t_next;
-                tic->t_next->t_prev = tic->t_prev;
-                tic->t_prev->t_next = tic->t_next;
-        }
-        tic->t_next = tic->t_prev = NULL;
+                space -= bytes;
-        tic->t_flags &= ~XLOG_TIC_IN_Q;
+                if (space < 0) {
+                        space += log->l_logsize;
+                        cycle--;
+                }
+                old = head_val;
+                new = xlog_assign_grant_head_val(cycle, space);
+                head_val = atomic64_cmpxchg(head, old, new);
+        } while (head_val != old);
 }
 static void
-xlog_grant_sub_space(struct log *log, int bytes)
+xlog_grant_add_space(
+        struct log      *log,
+        atomic64_t      *head,
+        int             bytes)
 {
-        log->l_grant_write_bytes -= bytes;
+        int64_t head_val = atomic64_read(head);
-        if (log->l_grant_write_bytes < 0) {
+        int64_t new, old;
-                log->l_grant_write_bytes += log->l_logsize;
-                log->l_grant_write_cycle--;
-        }
-        log->l_grant_reserve_bytes -= bytes;
-        if ((log)->l_grant_reserve_bytes < 0) {
-                log->l_grant_reserve_bytes += log->l_logsize;
-                log->l_grant_reserve_cycle--;
-        }
-}
+        do {
+                int             tmp;
+                int             cycle, space;
-static void
+                xlog_crack_grant_head_val(head_val, &cycle, &space);
-xlog_grant_add_space_write(struct log *log, int bytes)
-{
-        int tmp = log->l_logsize - log->l_grant_write_bytes;
-        if (tmp > bytes)
-                log->l_grant_write_bytes += bytes;
-        else {
-                log->l_grant_write_cycle++;
-                log->l_grant_write_bytes = bytes - tmp;
-        }
-}
-static void
+                tmp = log->l_logsize - space;
-xlog_grant_add_space_reserve(struct log *log, int bytes)
+                if (tmp > bytes)
-{
+                        space += bytes;
-        int tmp = log->l_logsize - log->l_grant_reserve_bytes;
+                else {
-        if (tmp > bytes)
+                        space = bytes - tmp;
-                log->l_grant_reserve_bytes += bytes;
+                        cycle++;
-        else {
+                }
-                log->l_grant_reserve_cycle++;
-                log->l_grant_reserve_bytes = bytes - tmp;
-        }
-}
-static inline void
+                old = head_val;
-xlog_grant_add_space(struct log *log, int bytes)
+                new = xlog_assign_grant_head_val(cycle, space);
-{
+                head_val = atomic64_cmpxchg(head, old, new);
-        xlog_grant_add_space_write(log, bytes);
+        } while (head_val != old);
-        xlog_grant_add_space_reserve(log, bytes);
 }
 static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
                trace_xfs_log_reserve(log, internal_ticket);
-                xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+                xlog_grant_push_ail(log, internal_ticket->t_unit_res);
                retval = xlog_regrant_write_log_space(log, internal_ticket);
        } else {
                /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
                trace_xfs_log_reserve(log, internal_ticket);
-                xlog_grant_push_ail(mp,
+                xlog_grant_push_ail(log,
                                    (internal_ticket->t_unit_res *
                                     internal_ticket->t_cnt));
                retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
                cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
        else {
                cmn_err(CE_NOTE,
-                        "!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
+                        "Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
                        mp->m_fsname);
                ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
        }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
                      iclog->ic_state == XLOG_STATE_DIRTY)) {
                        if (!XLOG_FORCED_SHUTDOWN(log)) {
-                                sv_wait(&iclog->ic_force_wait, PMEM,
+                                xlog_wait(&iclog->ic_force_wait,
-                                        &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                        } else {
                                spin_unlock(&log->l_icloglock);
                        }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                        || iclog->ic_state == XLOG_STATE_DIRTY
                        || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-                                sv_wait(&iclog->ic_force_wait, PMEM,
+                                xlog_wait(&iclog->ic_force_wait,
-                                        &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                } else {
                        spin_unlock(&log->l_icloglock);
                }
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 {
        xlog_ticket_t   *tic;
        xlog_t          *log = mp->m_log;
-        int             need_bytes, free_bytes, cycle, bytes;
+        int             need_bytes, free_bytes;
        if (XLOG_FORCED_SHUTDOWN(log))
                return;
-        if (tail_lsn == 0) {
+        if (tail_lsn == 0)
-                /* needed since sync_lsn is 64 bits */
+                tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-                spin_lock(&log->l_icloglock);
-                tail_lsn = log->l_last_sync_lsn;
-                spin_unlock(&log->l_icloglock);
-        }
-        spin_lock(&log->l_grant_lock);
-        /* Also an invalid lsn.  1 implies that we aren't passing in a valid
+        /* tail_lsn == 1 implies that we weren't passed a valid value.  */
-         * tail_lsn.
+        if (tail_lsn != 1)
-         */
+                atomic64_set(&log->l_tail_lsn, tail_lsn);
-        if (tail_lsn != 1) {
-                log->l_tail_lsn = tail_lsn;
-        }
-        if ((tic = log->l_write_headq)) {
+        if (!list_empty_careful(&log->l_writeq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-                cycle = log->l_grant_write_cycle;
+                spin_lock(&log->l_grant_write_lock);
-                bytes = log->l_grant_write_bytes;
+                free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-                free_bytes = xlog_space_left(log, cycle, bytes);
+                list_for_each_entry(tic, &log->l_writeq, t_queue) {
-                do {
                        ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
                        if (free_bytes < tic->t_unit_res && tail_lsn != 1)
                                break;
                        tail_lsn = 0;
                        free_bytes -= tic->t_unit_res;
-                        sv_signal(&tic->t_wait);
+                        trace_xfs_log_regrant_write_wake_up(log, tic);
-                        tic = tic->t_next;
+                        wake_up(&tic->t_wait);
-                } while (tic != log->l_write_headq);
+                }
+                spin_unlock(&log->l_grant_write_lock);
        }
-        if ((tic = log->l_reserve_headq)) {
+        if (!list_empty_careful(&log->l_reserveq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-                cycle = log->l_grant_reserve_cycle;
+                spin_lock(&log->l_grant_reserve_lock);
-                bytes = log->l_grant_reserve_bytes;
+                free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-                free_bytes = xlog_space_left(log, cycle, bytes);
+                list_for_each_entry(tic, &log->l_reserveq, t_queue) {
-                do {
                        if (tic->t_flags & XLOG_TIC_PERM_RESERV)
                                need_bytes = tic->t_unit_res*tic->t_cnt;
                        else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t	*mp,
                                break;
                        tail_lsn = 0;
                        free_bytes -= need_bytes;
-                        sv_signal(&tic->t_wait);
+                        trace_xfs_log_grant_wake_up(log, tic);
-                        tic = tic->t_next;
+                        wake_up(&tic->t_wait);
-                } while (tic != log->l_reserve_headq);
+                }
+                spin_unlock(&log->l_grant_reserve_lock);
        }
-        spin_unlock(&log->l_grant_lock);
+}
-}       /* xfs_log_move_tail */
 /*
 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
 * We may be holding the log iclog lock upon entering this routine.
 */
 xfs_lsn_t
-xlog_assign_tail_lsn(xfs_mount_t *mp)
+xlog_assign_tail_lsn(
+        struct xfs_mount        *mp)
 {
-        xfs_lsn_t tail_lsn;
+        xfs_lsn_t               tail_lsn;
-        xlog_t    *log = mp->m_log;
+        struct log              *log = mp->m_log;
        tail_lsn = xfs_trans_ail_tail(mp->m_ail);
-        spin_lock(&log->l_grant_lock);
+        if (!tail_lsn)
-        if (tail_lsn != 0) {
+                tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-                log->l_tail_lsn = tail_lsn;
-        } else {
-                tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
-        }
-        spin_unlock(&log->l_grant_lock);
+        atomic64_set(&log->l_tail_lsn, tail_lsn);
        return tail_lsn;
-}       /* xlog_assign_tail_lsn */
+}
 /*
 * Return the space in the log between the tail and the head.  The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
 * result is that we return the size of the log as the amount of space left.
 */
 STATIC int
-xlog_space_left(xlog_t *log, int cycle, int bytes)
+xlog_space_left(
-{
+        struct log      *log,
-        int free_bytes;
+        atomic64_t      *head)
-        int tail_bytes;
+{
-        int tail_cycle;
+        int             free_bytes;
+        int             tail_bytes;
-        tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
+        int             tail_cycle;
-        tail_cycle = CYCLE_LSN(log->l_tail_lsn);
+        int             head_cycle;
-        if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
+        int             head_bytes;
-                free_bytes = log->l_logsize - (bytes - tail_bytes);
-        } else if ((tail_cycle + 1) < cycle) {
+        xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+        tail_bytes = BBTOB(tail_bytes);
+        if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+                free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+        else if (tail_cycle + 1 < head_cycle)
                return 0;
-        } else if (tail_cycle < cycle) {
+        else if (tail_cycle < head_cycle) {
-                ASSERT(tail_cycle == (cycle - 1));
+                ASSERT(tail_cycle == (head_cycle - 1));
-                free_bytes = tail_bytes - bytes;
+                free_bytes = tail_bytes - head_bytes;
        } else {
                /*
                 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
                        "xlog_space_left: head behind tail\n"
                        "  tail_cycle = %d, tail_bytes = %d\n"
                        "  GH   cycle = %d, GH   bytes = %d",
-                        tail_cycle, tail_bytes, cycle, bytes);
+                        tail_cycle, tail_bytes, head_cycle, head_bytes);
                ASSERT(0);
                free_bytes = log->l_logsize;
        }
        return free_bytes;
-}       /* xlog_space_left */
+}
 /*
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_flags       |= XLOG_ACTIVE_RECOVERY;
        log->l_prev_block  = -1;
-        log->l_tail_lsn    = xlog_assign_lsn(1, 0);
        /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
-        log->l_last_sync_lsn = log->l_tail_lsn;
+        xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+        xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
        log->l_curr_cycle  = 1;     /* 0 is bad since this is initial value */
-        log->l_grant_reserve_cycle = 1;
+        xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
-        log->l_grant_write_cycle = 1;
+        xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
+        INIT_LIST_HEAD(&log->l_reserveq);
+        INIT_LIST_HEAD(&log->l_writeq);
+        spin_lock_init(&log->l_grant_reserve_lock);
+        spin_lock_init(&log->l_grant_write_lock);
        error = EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_xbuf = bp;
        spin_lock_init(&log->l_icloglock);
-        spin_lock_init(&log->l_grant_lock);
+        init_waitqueue_head(&log->l_flush_wait);
-        sv_init(&log->l_flush_wait, 0, "flush_wait");
        /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
        ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
                ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
                ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
-                sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
+                init_waitqueue_head(&iclog->ic_force_wait);
-                sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+                init_waitqueue_head(&iclog->ic_write_wait);
                iclogp = &iclog->ic_next;
        }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t	*mp,
 out_free_iclog:
        for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
                prev_iclog = iclog->ic_next;
-                if (iclog->ic_bp) {
+                if (iclog->ic_bp)
-                        sv_destroy(&iclog->ic_force_wait);
-                        sv_destroy(&iclog->ic_write_wait);
                        xfs_buf_free(iclog->ic_bp);
-                }
                kmem_free(iclog);
        }
        spinlock_destroy(&log->l_icloglock);
-        spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
 out_free_log:
        kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
 * water mark.  In this manner, we would be creating a low water mark.
 */
 STATIC void
-xlog_grant_push_ail(xfs_mount_t *mp,
+xlog_grant_push_ail(
-                    int         need_bytes)
+        struct log      *log,
+        int             need_bytes)
 {
-    xlog_t      *log = mp->m_log;       /* pointer to the log */
+        xfs_lsn_t       threshold_lsn = 0;
-    xfs_lsn_t   tail_lsn;               /* lsn of the log tail */
+        xfs_lsn_t       last_sync_lsn;
-    xfs_lsn_t   threshold_lsn = 0;      /* lsn we'd like to be at */
+        int             free_blocks;
-    int         free_blocks;            /* free blocks left to write to */
+        int             free_bytes;
-    int         free_bytes;             /* free bytes left to write to */
+        int             threshold_block;
-    int         threshold_block;        /* block in lsn we'd like to be at */
+        int             threshold_cycle;
-    int         threshold_cycle;        /* lsn cycle we'd like to be at */
+        int             free_threshold;
-    int         free_threshold;
+        ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
-    ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+        free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-    spin_lock(&log->l_grant_lock);
+        free_blocks = BTOBBT(free_bytes);
-    free_bytes = xlog_space_left(log,
-                                 log->l_grant_reserve_cycle,
+        /*
-                                 log->l_grant_reserve_bytes);
+         * Set the threshold for the minimum number of free blocks in the
-    tail_lsn = log->l_tail_lsn;
+         * log to the maximum of what the caller needs, one quarter of the
-    free_blocks = BTOBBT(free_bytes);
+         * log, and 256 blocks.
+         */
-    /*
+        free_threshold = BTOBB(need_bytes);
-     * Set the threshold for the minimum number of free blocks in the
+        free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
-     * log to the maximum of what the caller needs, one quarter of the
+        free_threshold = MAX(free_threshold, 256);
-     * log, and 256 blocks.
+        if (free_blocks >= free_threshold)
-     */
+                return;
-    free_threshold = BTOBB(need_bytes);
-    free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
-    free_threshold = MAX(free_threshold, 256);
+                                                &threshold_block);
-    if (free_blocks < free_threshold) {
+        threshold_block += free_threshold;
-        threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
-        threshold_cycle = CYCLE_LSN(tail_lsn);
        if (threshold_block >= log->l_logBBsize) {
-            threshold_block -= log->l_logBBsize;
+                threshold_block -= log->l_logBBsize;
-            threshold_cycle += 1;
+                threshold_cycle += 1;
        }
-        threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block);
+        threshold_lsn = xlog_assign_lsn(threshold_cycle,
+                                        threshold_block);
+        /*
+         * Don't pass in an lsn greater than the lsn of the last
+         * log record known to be on disk. Use a snapshot of the last sync lsn
+         * so that it doesn't change between the compare and the set.
+         */
+        last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+        if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+                threshold_lsn = last_sync_lsn;
-        /* Don't pass in an lsn greater than the lsn of the last
+        /*
-         * log record known to be on disk.
+         * Get the transaction layer to kick the dirty buffers out to
+         * disk asynchronously. No point in trying to do this if
+         * the filesystem is shutting down.
         */
-        if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
+        if (!XLOG_FORCED_SHUTDOWN(log))
-            threshold_lsn = log->l_last_sync_lsn;
+                xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-    }
+}
-    spin_unlock(&log->l_grant_lock);
-    /*
-     * Get the transaction layer to kick the dirty buffers out to
-     * disk asynchronously. No point in trying to do this if
-     * the filesystem is shutting down.
-     */
-    if (threshold_lsn &&
-        !XLOG_FORCED_SHUTDOWN(log))
-            xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-}       /* xlog_grant_push_ail */
 /*
 * The bdstrat callback function for log bufs. This gives us a central
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t		*log,
                 roundoff < BBTOB(1)));
        /* move grant heads by roundoff in sync */
-        spin_lock(&log->l_grant_lock);
+        xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
-        xlog_grant_add_space(log, roundoff);
+        xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
-        spin_unlock(&log->l_grant_lock);
        /* put cycle number in every block */
        xlog_pack_data(log, iclog, roundoff); 
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
-                sv_destroy(&iclog->ic_force_wait);
-                sv_destroy(&iclog->ic_write_wait);
                xfs_buf_free(iclog->ic_bp);
                next_iclog = iclog->ic_next;
                kmem_free(iclog);
                iclog = next_iclog;
        }
        spinlock_destroy(&log->l_icloglock);
-        spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
        log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
                                lowest_lsn = xlog_get_lowest_lsn(log);
                                if (lowest_lsn &&
                                    XFS_LSN_CMP(lowest_lsn,
-                                                be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+                                                be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
                                        iclog = iclog->ic_next;
                                        continue; /* Leave this iclog for
                                                   * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
                                iclog->ic_state = XLOG_STATE_CALLBACK;
-                                spin_unlock(&log->l_icloglock);
-                                /* l_last_sync_lsn field protected by
+                                /*
-                                 * l_grant_lock. Don't worry about iclog's lsn.
+                                 * update the last_sync_lsn before we drop the
-                                 * No one else can be here except us.
+                                 * icloglock to ensure we are the only one that
+                                 * can update it.
                                 */
-                                spin_lock(&log->l_grant_lock);
+                                ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
-                                ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn,
+                                        be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-                                       be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+                                atomic64_set(&log->l_last_sync_lsn,
-                                log->l_last_sync_lsn =
+                                        be64_to_cpu(iclog->ic_header.h_lsn));
-                                        be64_to_cpu(iclog->ic_header.h_lsn);
-                                spin_unlock(&log->l_grant_lock);
-                        } else {
+                        } else
-                                spin_unlock(&log->l_icloglock);
                                ioerrors++;
-                        }
+                        spin_unlock(&log->l_icloglock);
                        /*
                         * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
                        xlog_state_clean_log(log);
                        /* wake up threads waiting in xfs_log_force() */
-                        sv_broadcast(&iclog->ic_force_wait);
+                        wake_up_all(&iclog->ic_force_wait);
                        iclog = iclog->ic_next;
                } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
        spin_unlock(&log->l_icloglock);
        if (wake)
-                sv_broadcast(&log->l_flush_wait);
+                wake_up_all(&log->l_flush_wait);
 }
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
         * iclog buffer, we wake them all, one will get to do the
         * I/O, the others get to wait for the result.
         */
-        sv_broadcast(&iclog->ic_write_wait);
+        wake_up_all(&iclog->ic_write_wait);
        spin_unlock(&log->l_icloglock);
        xlog_state_do_callback(log, aborted, iclog);    /* also cleans log */
 }       /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
                XFS_STATS_INC(xs_log_noiclogs);
                /* Wait for log writes to have flushed */
-                sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
+                xlog_wait(&log->l_flush_wait, &log->l_icloglock);
                goto restart;
        }
@@ -2527,6 +2486,18 @@ restart:
 *
 * Once a ticket gets put onto the reserveq, it will only return after
 * the needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off the reserveq under the
+ * l_grant_reserve_lock, we only need to take that lock if we are going
+ * to add the ticket to the queue and sleep. We can avoid taking the lock if the
+ * ticket was never added to the reserveq because the t_queue list head will be
+ * empty and we hold the only reference to it so it can safely be checked
+ * unlocked.
 */
 STATIC int
 xlog_grant_log_space(xlog_t        *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t	   *log,
 {
        int              free_bytes;
        int              need_bytes;
-#ifdef DEBUG
-        xfs_lsn_t        tail_lsn;
-#endif
 #ifdef DEBUG
        if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                panic("grant Recovery problem");
 #endif
-        /* Is there space or do we need to sleep? */
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_grant_enter(log, tic);
+        need_bytes = tic->t_unit_res;
+        if (tic->t_flags & XFS_LOG_PERM_RESERV)
+                need_bytes *= tic->t_ocnt;
        /* something is already sleeping; insert new transaction at end */
-        if (log->l_reserve_headq) {
+        if (!list_empty_careful(&log->l_reserveq)) {
-                xlog_ins_ticketq(&log->l_reserve_headq, tic);
+                spin_lock(&log->l_grant_reserve_lock);
+                /* recheck the queue now we are locked */
+                if (list_empty(&log->l_reserveq)) {
+                        spin_unlock(&log->l_grant_reserve_lock);
+                        goto redo;
+                }
+                list_add_tail(&tic->t_queue, &log->l_reserveq);
                trace_xfs_log_grant_sleep1(log, tic);
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t	   *log,
                        goto error_return;
                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+                xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
                /*
                 * If we got an error, and the filesystem is shutting down,
                 * we'll catch it down below. So just continue...
                 */
                trace_xfs_log_grant_wake1(log, tic);
-                spin_lock(&log->l_grant_lock);
        }
-        if (tic->t_flags & XFS_LOG_PERM_RESERV)
-                need_bytes = tic->t_unit_res*tic->t_ocnt;
-        else
-                need_bytes = tic->t_unit_res;
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
-        free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
+        free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-                                     log->l_grant_reserve_bytes);
        if (free_bytes < need_bytes) {
-                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+                spin_lock(&log->l_grant_reserve_lock);
-                        xlog_ins_ticketq(&log->l_reserve_headq, tic);
+                if (list_empty(&tic->t_queue))
+                        list_add_tail(&tic->t_queue, &log->l_reserveq);
                trace_xfs_log_grant_sleep2(log, tic);
-                spin_unlock(&log->l_grant_lock);
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
-                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-                spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
-                trace_xfs_log_grant_wake2(log, tic);
+                xlog_grant_push_ail(log, need_bytes);
+                XFS_STATS_INC(xs_sleep_logspace);
+                xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+                trace_xfs_log_grant_wake2(log, tic);
                goto redo;
-        } else if (tic->t_flags & XLOG_TIC_IN_Q)
+        }
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
-        /* we've got enough space */
+        if (!list_empty(&tic->t_queue)) {
-        xlog_grant_add_space(log, need_bytes);
+                spin_lock(&log->l_grant_reserve_lock);
-#ifdef DEBUG
+                list_del_init(&tic->t_queue);
-        tail_lsn = log->l_tail_lsn;
+                spin_unlock(&log->l_grant_reserve_lock);
-        /*
-         * Check to make sure the grant write head didn't just over lap the
-         * tail.  If the cycles are the same, we can't be overlapping.
-         * Otherwise, make sure that the cycles differ by exactly one and
-         * check the byte count.
-         */
-        if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-                ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-                ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
        }
-#endif
+        /* we've got enough space */
+        xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
+        xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_grant_exit(log, tic);
-        xlog_verify_grant_head(log, 1);
+        xlog_verify_grant_tail(log);
-        spin_unlock(&log->l_grant_lock);
        return 0;
- error_return:
+error_return_unlocked:
-        if (tic->t_flags & XLOG_TIC_IN_Q)
+        spin_lock(&log->l_grant_reserve_lock);
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
+error_return:
+        list_del_init(&tic->t_queue);
+        spin_unlock(&log->l_grant_reserve_lock);
        trace_xfs_log_grant_error(log, tic);
        /*
@@ -2638,7 +2597,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-        spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }       /* xlog_grant_log_space */
@@ -2646,17 +2604,14 @@ redo:
 /*
 * Replenish the byte reservation required by moving the grant write head.
 *
- *
+ * Similar to xlog_grant_log_space, the function is structured to have a lock
+ * free fast path.
 */
 STATIC int
 xlog_regrant_write_log_space(xlog_t        *log,
                             xlog_ticket_t *tic)
 {
        int             free_bytes, need_bytes;
-        xlog_ticket_t   *ntic;
-#ifdef DEBUG
-        xfs_lsn_t       tail_lsn;
-#endif
        tic->t_curr_res = tic->t_unit_res;
        xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t	   *log,
                panic("regrant Recovery problem");
 #endif
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_regrant_write_enter(log, tic);
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
        /* If there are other waiters on the queue then give them a
         * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t	   *log,
         * this transaction.
         */
        need_bytes = tic->t_unit_res;
-        if ((ntic = log->l_write_headq)) {
+        if (!list_empty_careful(&log->l_writeq)) {
-                free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+                struct xlog_ticket *ntic;
-                                             log->l_grant_write_bytes);
-                do {
+                spin_lock(&log->l_grant_write_lock);
+                free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+                list_for_each_entry(ntic, &log->l_writeq, t_queue) {
                        ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
                        if (free_bytes < ntic->t_unit_res)
                                break;
                        free_bytes -= ntic->t_unit_res;
-                        sv_signal(&ntic->t_wait);
+                        wake_up(&ntic->t_wait);
-                        ntic = ntic->t_next;
+                }
-                } while (ntic != log->l_write_headq);
-                if (ntic != log->l_write_headq) {
-                        if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
-                                xlog_ins_ticketq(&log->l_write_headq, tic);
+                if (ntic != list_first_entry(&log->l_writeq,
+                                                struct xlog_ticket, t_queue)) {
+                        if (list_empty(&tic->t_queue))
+                                list_add_tail(&tic->t_queue, &log->l_writeq);
                        trace_xfs_log_regrant_write_sleep1(log, tic);
-                        spin_unlock(&log->l_grant_lock);
+                        xlog_grant_push_ail(log, need_bytes);
-                        xlog_grant_push_ail(log->l_mp, need_bytes);
-                        spin_lock(&log->l_grant_lock);
                        XFS_STATS_INC(xs_sleep_logspace);
-                        sv_wait(&tic->t_wait, PINOD|PLTWAIT,
+                        xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-                                &log->l_grant_lock, s);
-                        /* If we're shutting down, this tic is already
-                         * off the queue */
-                        spin_lock(&log->l_grant_lock);
-                        if (XLOG_FORCED_SHUTDOWN(log))
-                                goto error_return;
                        trace_xfs_log_regrant_write_wake1(log, tic);
-                }
+                } else
+                        spin_unlock(&log->l_grant_write_lock);
        }
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-                goto error_return;
+                goto error_return_unlocked;
-        free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+        free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-                                     log->l_grant_write_bytes);
        if (free_bytes < need_bytes) {
-                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+                spin_lock(&log->l_grant_write_lock);
-                        xlog_ins_ticketq(&log->l_write_headq, tic);
+                if (list_empty(&tic->t_queue))
-                spin_unlock(&log->l_grant_lock);
+                        list_add_tail(&tic->t_queue, &log->l_writeq);
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
-                XFS_STATS_INC(xs_sleep_logspace);
-                trace_xfs_log_regrant_write_sleep2(log, tic);
-                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-                /* If we're shutting down, this tic is already off the queue */
-                spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
+                xlog_grant_push_ail(log, need_bytes);
+                XFS_STATS_INC(xs_sleep_logspace);
+                trace_xfs_log_regrant_write_sleep2(log, tic);
+                xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
                trace_xfs_log_regrant_write_wake2(log, tic);
                goto redo;
-        } else if (tic->t_flags & XLOG_TIC_IN_Q)
+        }
-                xlog_del_ticketq(&log->l_write_headq, tic);
-        /* we've got enough space */
+        if (!list_empty(&tic->t_queue)) {
-        xlog_grant_add_space_write(log, need_bytes);
+                spin_lock(&log->l_grant_write_lock);
-#ifdef DEBUG
+                list_del_init(&tic->t_queue);
-        tail_lsn = log->l_tail_lsn;
+                spin_unlock(&log->l_grant_write_lock);
-        if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-                ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-                ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
        }
-#endif
+        /* we've got enough space */
+        xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_regrant_write_exit(log, tic);
+        xlog_verify_grant_tail(log);
-        xlog_verify_grant_head(log, 1);
-        spin_unlock(&log->l_grant_lock);
        return 0;
+ error_return_unlocked:
+        spin_lock(&log->l_grant_write_lock);
 error_return:
-        if (tic->t_flags & XLOG_TIC_IN_Q)
+        list_del_init(&tic->t_queue);
-                xlog_del_ticketq(&log->l_reserve_headq, tic);
+        spin_unlock(&log->l_grant_write_lock);
        trace_xfs_log_regrant_write_error(log, tic);
        /*
@@ -2778,7 +2714,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-        spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }       /* xlog_regrant_write_log_space */
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
-        spin_lock(&log->l_grant_lock);
+        xlog_grant_sub_space(log, &log->l_grant_reserve_head,
-        xlog_grant_sub_space(log, ticket->t_curr_res);
+                                        ticket->t_curr_res);
+        xlog_grant_sub_space(log, &log->l_grant_write_head,
+                                        ticket->t_curr_res);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
        trace_xfs_log_regrant_reserve_sub(log, ticket);
-        xlog_verify_grant_head(log, 1);
        /* just return if we still have some of the pre-reserved space */
-        if (ticket->t_cnt > 0) {
+        if (ticket->t_cnt > 0)
-                spin_unlock(&log->l_grant_lock);
                return;
-        }
-        xlog_grant_add_space_reserve(log, ticket->t_unit_res);
+        xlog_grant_add_space(log, &log->l_grant_reserve_head,
+                                        ticket->t_unit_res);
        trace_xfs_log_regrant_reserve_exit(log, ticket);
-        xlog_verify_grant_head(log, 0);
-        spin_unlock(&log->l_grant_lock);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
 }       /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
 xlog_ungrant_log_space(xlog_t        *log,
                       xlog_ticket_t *ticket)
 {
+        int     bytes;
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
-        spin_lock(&log->l_grant_lock);
        trace_xfs_log_ungrant_enter(log, ticket);
-        xlog_grant_sub_space(log, ticket->t_curr_res);
        trace_xfs_log_ungrant_sub(log, ticket);
-        /* If this is a permanent reservation ticket, we may be able to free
+        /*
+         * If this is a permanent reservation ticket, we may be able to free
         * up more space based on the remaining count.
         */
+        bytes = ticket->t_curr_res;
        if (ticket->t_cnt > 0) {
                ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
-                xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
+                bytes += ticket->t_unit_res*ticket->t_cnt;
        }
+        xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
+        xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
        trace_xfs_log_ungrant_exit(log, ticket);
-        xlog_verify_grant_head(log, 1);
-        spin_unlock(&log->l_grant_lock);
        xfs_log_move_tail(log->l_mp, 1);
 }       /* xlog_ungrant_log_space */
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
        if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
                /* update tail before writing to iclog */
-                xlog_assign_tail_lsn(log->l_mp);
+                xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
                sync++;
                iclog->ic_state = XLOG_STATE_SYNCING;
-                iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
+                iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
-                xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+                xlog_verify_tail_lsn(log, iclog, tail_lsn);
                /* cycle incremented when incrementing curr_block */
        }
        spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
                        return XFS_ERROR(EIO);
                }
                XFS_STATS_INC(xs_log_force_sleep);
-                sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
+                xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                /*
                 * No need to grab the log lock here since we're
                 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
                                XFS_STATS_INC(xs_log_force_sleep);
-                                sv_wait(&iclog->ic_prev->ic_write_wait,
+                                xlog_wait(&iclog->ic_prev->ic_write_wait,
-                                        PSWP, &log->l_icloglock, s);
+                                                        &log->l_icloglock);
                                if (log_flushed)
                                        *log_flushed = 1;
                                already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
                                return XFS_ERROR(EIO);
                        }
                        XFS_STATS_INC(xs_log_force_sleep);
-                        sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+                        xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                        /*
                         * No need to grab the log lock here since we're
                         * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
        xlog_ticket_t   *ticket)
 {
        ASSERT(atomic_read(&ticket->t_ref) > 0);
-        if (atomic_dec_and_test(&ticket->t_ref)) {
+        if (atomic_dec_and_test(&ticket->t_ref))
-                sv_destroy(&ticket->t_wait);
                kmem_zone_free(xfs_log_ticket_zone, ticket);
-        }
 }
 xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
        }
        atomic_set(&tic->t_ref, 1);
+        INIT_LIST_HEAD(&tic->t_queue);
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
        tic->t_cnt              = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
        tic->t_trans_type       = 0;
        if (xflags & XFS_LOG_PERM_RESERV)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
-        sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
+        init_waitqueue_head(&tic->t_wait);
        xlog_tic_reset_res(tic);
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
 }
 STATIC void
-xlog_verify_grant_head(xlog_t *log, int equals)
+xlog_verify_grant_tail(
+        struct log      *log)
 {
-    if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
+        int             tail_cycle, tail_blocks;
-        if (equals)
+        int             cycle, space;
-            ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
-        else
+        /*
-            ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
+         * Check to make sure the grant write head didn't just overlap the
-    } else {
+         * tail.  If the cycles are the same, we can't be overlapping.
-        ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
+         * Otherwise, make sure that the cycles differ by exactly one and
-        ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
+         * check the byte count.
-    }
+         */
-}       /* xlog_verify_grant_head */
+        xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+        xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+        if (tail_cycle != cycle) {
+                ASSERT(cycle - 1 == tail_cycle);
+                ASSERT(space <= BBTOB(tail_blocks));
+        }
+}
 /* check if it will fit */
 STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
                xlog_cil_force(log);
        /*
-         * We must hold both the GRANT lock and the LOG lock,
+         * mark the filesystem and the log as in a shutdown state and wake
-         * before we mark the filesystem SHUTDOWN and wake
+         * everybody up to tell them the bad news.
-         * everybody up to tell the bad news.
         */
        spin_lock(&log->l_icloglock);
-        spin_lock(&log->l_grant_lock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
        if (mp->m_sb_bp)
                XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
        spin_unlock(&log->l_icloglock);
        /*
-         * We don't want anybody waiting for log reservations
+         * We don't want anybody waiting for log reservations after this. That
-         * after this. That means we have to wake up everybody
+         * means we have to wake up everybody queued up on reserveq as well as
-         * queued up on reserve_headq as well as write_headq.
+         * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
-         * In addition, we make sure in xlog_{re}grant_log_space
+         * we don't enqueue anything once the SHUTDOWN flag is set, and this
-         * that we don't enqueue anything once the SHUTDOWN flag
+         * action is protected by the grant locks.
-         * is set, and this action is protected by the GRANTLOCK.
         */
-        if ((tic = log->l_reserve_headq)) {
+        spin_lock(&log->l_grant_reserve_lock);
-                do {
+        list_for_each_entry(tic, &log->l_reserveq, t_queue)
-                        sv_signal(&tic->t_wait);
+                wake_up(&tic->t_wait);
-                        tic = tic->t_next;
+        spin_unlock(&log->l_grant_reserve_lock);
-                } while (tic != log->l_reserve_headq);
-        }
+        spin_lock(&log->l_grant_write_lock);
+        list_for_each_entry(tic, &log->l_writeq, t_queue)
-        if ((tic = log->l_write_headq)) {
+                wake_up(&tic->t_wait);
-                do {
+        spin_unlock(&log->l_grant_write_lock);
-                        sv_signal(&tic->t_wait);
-                        tic = tic->t_next;
-                } while (tic != log->l_write_headq);
-        }
-        spin_unlock(&log->l_grant_lock);
        if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
                ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d..3bd3291ef8d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
-int     xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void    xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
                                struct xfs_log_vec *log_vector,
                                xfs_lsn_t *commit_lsn, int flags);
 bool    xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97..9ca59be0897 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
        init_rwsem(&cil->xc_ctx_lock);
-        sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+        init_waitqueue_head(&cil->xc_commit_wait);
        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
        int     abort)
 {
        struct xfs_cil_ctx      *ctx = args;
-        struct xfs_log_vec      *lv;
-        int                     abortflag = abort ? XFS_LI_ABORTED : 0;
        struct xfs_busy_extent  *busyp, *n;
-        /* unpin all the log items */
+        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
-        for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+                                        ctx->start_lsn, abort);
-                xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
-                                                        abortflag);
-        }
        list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
                xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -548,7 +543,7 @@ xlog_cil_push(
        error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
        if (error)
-                goto out_abort;
+                goto out_abort_free_ticket;
        /*
         * now that we've written the checkpoint into the log, strictly
@@ -568,14 +563,15 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
        }
        spin_unlock(&cil->xc_cil_lock);
+        /* xfs_log_done always frees the ticket on error. */
        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
-        if (error || commit_lsn == -1)
+        if (commit_lsn == -1)
                goto out_abort;
        /* attach all the transactions w/ busy extents to iclog */
@@ -592,7 +588,7 @@ restart:
         */
        spin_lock(&cil->xc_cil_lock);
        ctx->commit_lsn = commit_lsn;
-        sv_broadcast(&cil->xc_commit_wait);
+        wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_cil_lock);
        /* release the hounds! */
@@ -605,6 +601,8 @@ out_free_ticket:
        kmem_free(new_ctx);
        return 0;
+out_abort_free_ticket:
+        xfs_log_ticket_put(tic);
 out_abort:
        xlog_cil_committed(ctx, XFS_LI_ABORTED);
        return XFS_ERROR(EIO);
@@ -627,7 +625,7 @@ out_abort:
 * background commit, returns without it held once background commits are
 * allowed again.
 */
-int
+void
 xfs_log_commit_cil(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
@@ -642,11 +640,6 @@ xfs_log_commit_cil(
        if (flags & XFS_TRANS_RELEASE_LOG_RES)
                log_flags = XFS_LOG_REL_PERM_RESERV;
-        if (XLOG_FORCED_SHUTDOWN(log)) {
-                xlog_cil_free_logvec(log_vector);
-                return XFS_ERROR(EIO);
-        }
        /*
         * do all the hard work of formatting items (including memory
         * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -706,7 +699,6 @@ xfs_log_commit_cil(
         */
        if (push)
                xlog_cil_push(log, 0);
-        return 0;
 }
 /*
@@ -757,7 +749,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617..d5f8be8f4bf 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
 struct xfs_buf;
 struct log;
 struct xlog_ticket;
-struct xfs_buf_cancel;
 struct xfs_mount;
 /*
@@ -54,7 +53,6 @@ struct xfs_mount;
        BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
 static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
 {
        return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
 */
 #define XLOG_TIC_INITED         0x1     /* has been initialized */
 #define XLOG_TIC_PERM_RESERV    0x2     /* permanent reservation */
-#define XLOG_TIC_IN_Q           0x4
 #define XLOG_TIC_FLAGS \
        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
-        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
+        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
-        { XLOG_TIC_IN_Q,        "XLOG_TIC_IN_Q" }
 #endif  /* __KERNEL__ */
@@ -244,9 +240,8 @@ typedef struct xlog_res {
 } xlog_res_t;
 typedef struct xlog_ticket {
-        sv_t               t_wait;       /* ticket wait queue            : 20 */
+        wait_queue_head_t  t_wait;       /* ticket wait queue */
-        struct xlog_ticket *t_next;      /*                              :4|8 */
+        struct list_head   t_queue;      /* reserve/write queue */
-        struct xlog_ticket *t_prev;      /*                              :4|8 */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
        atomic_t           t_ref;        /* ticket reference count       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
 * and move everything else out to subsequent cachelines.
 */
 typedef struct xlog_in_core {
-        sv_t                    ic_force_wait;
+        wait_queue_head_t       ic_force_wait;
-        sv_t                    ic_write_wait;
+        wait_queue_head_t       ic_write_wait;
        struct xlog_in_core     *ic_next;
        struct xlog_in_core     *ic_prev;
        struct xfs_buf          *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
        struct xfs_cil_ctx      *xc_ctx;
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
-        sv_t                    xc_commit_wait;
+        wait_queue_head_t       xc_commit_wait;
        xfs_lsn_t               xc_current_sequence;
 };
@@ -491,7 +486,7 @@ typedef struct log {
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
        uint                    l_flags;
        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-        struct xfs_buf_cancel   **l_buf_cancel_table;
+        struct list_head        *l_buf_cancel_table;
        int                     l_iclog_hsize;  /* size of iclog header */
        int                     l_iclog_heads;  /* # of iclog header sectors */
        uint                    l_sectBBsize;   /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
        int                     l_logBBsize;    /* size of log in BB chunks */
        /* The following block of fields are changed while holding icloglock */
-        sv_t                    l_flush_wait ____cacheline_aligned_in_smp;
+        wait_queue_head_t       l_flush_wait ____cacheline_aligned_in_smp;
                                                /* waiting for iclog flush */
        int                     l_covered_state;/* state of "covering disk
                                                 * log entries" */
        xlog_in_core_t          *l_iclog;       /* head log queue       */
        spinlock_t              l_icloglock;    /* grab to change iclog state */
-        xfs_lsn_t               l_tail_lsn;     /* lsn of 1st LR with unflushed
-                                                 * buffers */
-        xfs_lsn_t               l_last_sync_lsn;/* lsn of last LR on disk */
        int                     l_curr_cycle;   /* Cycle number of log writes */
        int                     l_prev_cycle;   /* Cycle number before last
                                                 * block increment */
        int                     l_curr_block;   /* current logical log block */
        int                     l_prev_block;   /* previous logical log block */
-        /* The following block of fields are changed while holding grant_lock */
+        /*
-        spinlock_t              l_grant_lock ____cacheline_aligned_in_smp;
+         * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
-        xlog_ticket_t           *l_reserve_headq;
+         * read without needing to hold specific locks. To avoid operations
-        xlog_ticket_t           *l_write_headq;
+         * contending with other hot objects, place each of them on a separate
-        int                     l_grant_reserve_cycle;
+         * cacheline.
-        int                     l_grant_reserve_bytes;
+         */
-        int                     l_grant_write_cycle;
+        /* lsn of last LR on disk */
-        int                     l_grant_write_bytes;
+        atomic64_t              l_last_sync_lsn ____cacheline_aligned_in_smp;
+        /* lsn of 1st LR with unflushed buffers */
+        atomic64_t              l_tail_lsn ____cacheline_aligned_in_smp;
+        /*
+         * ticket grant locks, queues and accounting have their own cachelines
+         * as these are quite hot and can be operated on concurrently.
+         */
+        spinlock_t              l_grant_reserve_lock ____cacheline_aligned_in_smp;
+        struct list_head        l_reserveq;
+        atomic64_t              l_grant_reserve_head;
+        spinlock_t              l_grant_write_lock ____cacheline_aligned_in_smp;
+        struct list_head        l_writeq;
+        atomic64_t              l_grant_write_head;
        /* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
 } xlog_t;
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+        ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
 #define XLOG_FORCED_SHUTDOWN(log)       ((log)->l_flags & XLOG_IO_ERROR)
 /* common routines */
@@ -562,6 +571,61 @@ int	xlog_write(struct log *log, struct xfs_log_vec *log_vector,
                                xlog_in_core_t **commit_iclog, uint flags);
 /*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+        xfs_lsn_t val = atomic64_read(lsn);
+        *cycle = CYCLE_LSN(val);
+        *block = BLOCK_LSN(val);
+}
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+        atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+        *cycle = val >> 32;
+        *space = val & 0xffffffff;
+}
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+        xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+        return ((int64_t)cycle << 32) | space;
+}
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+        atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+/*
 * Committed Item List interfaces
 */
 int     xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
 */
 #define XLOG_UNMOUNT_REC_TYPE   (-1U)
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        add_wait_queue_exclusive(wq, &wait);
+        __set_current_state(TASK_UNINTERRUPTIBLE);
+        spin_unlock(lock);
+        schedule();
+        remove_wait_queue(wq, &wait);
+}
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458..aa0ebb77690 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 #endif
 /*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+        xfs_daddr_t             bc_blkno;
+        uint                    bc_len;
+        int                     bc_refcount;
+        struct list_head        bc_list;
+};
+/*
 * Sector aligned buffer routines for buffer create/read/write/access
 */
@@ -925,12 +936,12 @@ xlog_find_tail(
        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
        if (found == 2)
                log->l_curr_cycle++;
-        log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
+        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-        log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
+        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-        log->l_grant_reserve_cycle = log->l_curr_cycle;
+        xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
-        log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
+                                        BBTOB(log->l_curr_block));
-        log->l_grant_write_cycle = log->l_curr_cycle;
+        xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
-        log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+                                        BBTOB(log->l_curr_block));
        /*
         * Look for unmount record.  If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
        }
        after_umount_blk = (i + hblks + (int)
                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
-        tail_lsn = log->l_tail_lsn;
+        tail_lsn = atomic64_read(&log->l_tail_lsn);
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
                         * log records will point recovery to after the
                         * current unmount record.
                         */
-                        log->l_tail_lsn =
+                        xlog_assign_atomic_lsn(&log->l_tail_lsn,
-                                xlog_assign_lsn(log->l_curr_cycle,
+                                        log->l_curr_cycle, after_umount_blk);
-                                                after_umount_blk);
+                        xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
-                        log->l_last_sync_lsn =
+                                        log->l_curr_cycle, after_umount_blk);
-                                xlog_assign_lsn(log->l_curr_cycle,
-                                                after_umount_blk);
                        *tail_blk = after_umount_blk;
                        /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
 * record in the table to tell us how many times we expect to see this
 * record during the second pass.
 */
-STATIC void
+STATIC int
-xlog_recover_do_buffer_pass1(
+xlog_recover_buffer_pass1(
-        xlog_t                  *log,
+        struct log              *log,
-        xfs_buf_log_format_t    *buf_f)
+        xlog_recover_item_t     *item)
 {
-        xfs_buf_cancel_t        *bcp;
+        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-        xfs_buf_cancel_t        *nextp;
+        struct list_head        *bucket;
-        xfs_buf_cancel_t        *prevp;
+        struct xfs_buf_cancel   *bcp;
-        xfs_buf_cancel_t        **bucket;
-        xfs_daddr_t             blkno = 0;
-        uint                    len = 0;
-        ushort                  flags = 0;
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                blkno = buf_f->blf_blkno;
-                len = buf_f->blf_len;
-                flags = buf_f->blf_flags;
-                break;
-        }
        /*
         * If this isn't a cancel buffer item, then just return.
         */
-        if (!(flags & XFS_BLF_CANCEL)) {
+        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
-                return;
+                return 0;
-        }
-        /*
-         * Insert an xfs_buf_cancel record into the hash table of
-         * them.  If there is already an identical record, bump
-         * its reference count.
-         */
-        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                          XLOG_BC_TABLE_SIZE];
-        /*
-         * If the hash bucket is empty then just insert a new record into
-         * the bucket.
-         */
-        if (*bucket == NULL) {
-                bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
-                                                     KM_SLEEP);
-                bcp->bc_blkno = blkno;
-                bcp->bc_len = len;
-                bcp->bc_refcount = 1;
-                bcp->bc_next = NULL;
-                *bucket = bcp;
-                return;
        }
        /*
-         * The hash bucket is not empty, so search for duplicates of our
+         * Insert an xfs_buf_cancel record into the hash table of them.
-         * record.  If we find one them just bump its refcount.  If not
+         * If there is already an identical record, bump its reference count.
-         * then add us at the end of the list.
         */
-        prevp = NULL;
+        bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
-        nextp = *bucket;
+        list_for_each_entry(bcp, bucket, bc_list) {
-        while (nextp != NULL) {
+                if (bcp->bc_blkno == buf_f->blf_blkno &&
-                if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
+                    bcp->bc_len == buf_f->blf_len) {
-                        nextp->bc_refcount++;
+                        bcp->bc_refcount++;
                        trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
-                        return;
+                        return 0;
                }
-                prevp = nextp;
+        }
-                nextp = nextp->bc_next;
-        }
+        bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
-        ASSERT(prevp != NULL);
+        bcp->bc_blkno = buf_f->blf_blkno;
-        bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
+        bcp->bc_len = buf_f->blf_len;
-                                             KM_SLEEP);
-        bcp->bc_blkno = blkno;
-        bcp->bc_len = len;
        bcp->bc_refcount = 1;
-        bcp->bc_next = NULL;
+        list_add_tail(&bcp->bc_list, bucket);
-        prevp->bc_next = bcp;
        trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+        return 0;
 }
 /*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
 */
 STATIC int
 xlog_check_buffer_cancelled(
-        xlog_t                  *log,
+        struct log              *log,
        xfs_daddr_t             blkno,
        uint                    len,
        ushort                  flags)
 {
-        xfs_buf_cancel_t        *bcp;
+        struct list_head        *bucket;
-        xfs_buf_cancel_t        *prevp;
+        struct xfs_buf_cancel   *bcp;
-        xfs_buf_cancel_t        **bucket;
        if (log->l_buf_cancel_table == NULL) {
                /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
                return 0;
        }
-        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                          XLOG_BC_TABLE_SIZE];
-        bcp = *bucket;
-        if (bcp == NULL) {
-                /*
-                 * There is no corresponding entry in the table built
-                 * in pass one, so this buffer has not been cancelled.
-                 */
-                ASSERT(!(flags & XFS_BLF_CANCEL));
-                return 0;
-        }
        /*
-         * Search for an entry in the buffer cancel table that
+         * Search for an entry in the cancel table that matches our buffer.
-         * matches our buffer.
         */
-        prevp = NULL;
+        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
-        while (bcp != NULL) {
+        list_for_each_entry(bcp, bucket, bc_list) {
-                if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
+                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
-                        /*
+                        goto found;
-                         * We've go a match, so return 1 so that the
-                         * recovery of this buffer is cancelled.
-                         * If this buffer is actually a buffer cancel
-                         * log item, then decrement the refcount on the
-                         * one in the table and remove it if this is the
-                         * last reference.
-                         */
-                        if (flags & XFS_BLF_CANCEL) {
-                                bcp->bc_refcount--;
-                                if (bcp->bc_refcount == 0) {
-                                        if (prevp == NULL) {
-                                                *bucket = bcp->bc_next;
-                                        } else {
-                                                prevp->bc_next = bcp->bc_next;
-                                        }
-                                        kmem_free(bcp);
-                                }
-                        }
-                        return 1;
-                }
-                prevp = bcp;
-                bcp = bcp->bc_next;
        }
        /*
-         * We didn't find a corresponding entry in the table, so
+         * We didn't find a corresponding entry in the table, so return 0 so
-         * return 0 so that the buffer is NOT cancelled.
+         * that the buffer is NOT cancelled.
         */
        ASSERT(!(flags & XFS_BLF_CANCEL));
        return 0;
-}
-STATIC int
+found:
-xlog_recover_do_buffer_pass2(
+        /*
-        xlog_t                  *log,
+         * We've got a match, so return 1 so that the recovery of this buffer
-        xfs_buf_log_format_t    *buf_f)
+         * is cancelled.  If this buffer is actually a buffer cancel log
-{
+         * item, then decrement the refcount on the one in the table and
-        xfs_daddr_t             blkno = 0;
+         * remove it if this is the last reference.
-        ushort                  flags = 0;
+         */
-        uint                    len = 0;
+        if (flags & XFS_BLF_CANCEL) {
+                if (--bcp->bc_refcount == 0) {
-        switch (buf_f->blf_type) {
+                        list_del(&bcp->bc_list);
-        case XFS_LI_BUF:
+                        kmem_free(bcp);
-                blkno = buf_f->blf_blkno;
+                }
-                flags = buf_f->blf_flags;
-                len = buf_f->blf_len;
-                break;
        }
+        return 1;
-        return xlog_check_buffer_cancelled(log, blkno, len, flags);
 }
 /*
- * Perform recovery for a buffer full of inodes.  In these buffers,
+ * Perform recovery for a buffer full of inodes.  In these buffers, the only
- * the only data which should be recovered is that which corresponds
+ * data which should be recovered is that which corresponds to the
- * to the di_next_unlinked pointers in the on disk inode structures.
+ * di_next_unlinked pointers in the on disk inode structures.  The rest of the
- * The rest of the data for the inodes is always logged through the
+ * data for the inodes is always logged through the inodes themselves rather
- * inodes themselves rather than the inode buffer and is recovered
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
- * in xlog_recover_do_inode_trans().
 *
- * The only time when buffers full of inodes are fully recovered is
+ * The only time when buffers full of inodes are fully recovered is when the
- * when the buffer is full of newly allocated inodes.  In this case
+ * buffer is full of newly allocated inodes.  In this case the buffer will
- * the buffer will not be marked as an inode buffer and so will be
+ * not be marked as an inode buffer and so will be sent to
- * sent to xlog_recover_do_reg_buffer() below during recovery.
+ * xlog_recover_do_reg_buffer() below during recovery.
 */
 STATIC int
 xlog_recover_do_inode_buffer(
-        xfs_mount_t             *mp,
+        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
-        int                     item_index;
+        int                     item_index = 0;
-        int                     bit;
+        int                     bit = 0;
-        int                     nbits;
+        int                     nbits = 0;
-        int                     reg_buf_offset;
+        int                     reg_buf_offset = 0;
-        int                     reg_buf_bytes;
+        int                     reg_buf_bytes = 0;
        int                     next_unlinked_offset;
        int                     inodes_per_buf;
        xfs_agino_t             *logged_nextp;
        xfs_agino_t             *buffer_nextp;
-        unsigned int            *data_map = NULL;
-        unsigned int            map_size = 0;
        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                data_map = buf_f->blf_data_map;
-                map_size = buf_f->blf_map_size;
-                break;
-        }
-        /*
-         * Set the variables corresponding to the current region to
-         * 0 so that we'll initialize them on the first pass through
-         * the loop.
-         */
-        reg_buf_offset = 0;
-        reg_buf_bytes = 0;
-        bit = 0;
-        nbits = 0;
-        item_index = 0;
        inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
        for (i = 0; i < inodes_per_buf; i++) {
                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
                         * the current di_next_unlinked field.
                         */
                        bit += nbits;
-                        bit = xfs_next_bit(data_map, map_size, bit);
+                        bit = xfs_next_bit(buf_f->blf_data_map,
+                                           buf_f->blf_map_size, bit);
                        /*
                         * If there are no more logged regions in the
                         * buffer, then we're done.
                         */
-                        if (bit == -1) {
+                        if (bit == -1)
                                return 0;
-                        }
-                        nbits = xfs_contig_bits(data_map, map_size,
+                        nbits = xfs_contig_bits(buf_f->blf_data_map,
-                                                         bit);
+                                                buf_f->blf_map_size, bit);
                        ASSERT(nbits > 0);
                        reg_buf_offset = bit << XFS_BLF_SHIFT;
                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
                 * di_next_unlinked field, then move on to the next
                 * di_next_unlinked field.
                 */
-                if (next_unlinked_offset < reg_buf_offset) {
+                if (next_unlinked_offset < reg_buf_offset)
                        continue;
-                }
                ASSERT(item->ri_buf[item_index].i_addr != NULL);
                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
 * given buffer.  The bitmap in the buf log format structure indicates
 * where to place the logged data.
 */
-/*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
        int                     bit;
        int                     nbits;
-        unsigned int            *data_map = NULL;
-        unsigned int            map_size = 0;
        int                     error;
        trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                data_map = buf_f->blf_data_map;
-                map_size = buf_f->blf_map_size;
-                break;
-        }
        bit = 0;
        i = 1;  /* 0 is the buf format structure */
        while (1) {
-                bit = xfs_next_bit(data_map, map_size, bit);
+                bit = xfs_next_bit(buf_f->blf_data_map,
+                                   buf_f->blf_map_size, bit);
                if (bit == -1)
                        break;
-                nbits = xfs_contig_bits(data_map, map_size, bit);
+                nbits = xfs_contig_bits(buf_f->blf_data_map,
+                                        buf_f->blf_map_size, bit);
                ASSERT(nbits > 0);
                ASSERT(item->ri_buf[i].i_addr != NULL);
                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
 * for more details on the implementation of the table of cancel records.
 */
 STATIC int
-xlog_recover_do_buffer_trans(
+xlog_recover_buffer_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        int                     error;
-        int                     cancel;
-        xfs_daddr_t             blkno;
-        int                     len;
-        ushort                  flags;
        uint                    buf_flags;
-        if (pass == XLOG_RECOVER_PASS1) {
+        /*
-                /*
+         * In this pass we only want to recover all the buffers which have
-                 * In this pass we're only looking for buf items
+         * not been cancelled and are not cancellation buffers themselves.
-                 * with the XFS_BLF_CANCEL bit set.
+         */
-                 */
+        if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
-                xlog_recover_do_buffer_pass1(log, buf_f);
+                        buf_f->blf_len, buf_f->blf_flags)) {
+                trace_xfs_log_recover_buf_cancel(log, buf_f);
                return 0;
-        } else {
-                /*
-                 * In this pass we want to recover all the buffers
-                 * which have not been cancelled and are not
-                 * cancellation buffers themselves.  The routine
-                 * we call here will tell us whether or not to
-                 * continue with the replay of this buffer.
-                 */
-                cancel = xlog_recover_do_buffer_pass2(log, buf_f);
-                if (cancel) {
-                        trace_xfs_log_recover_buf_cancel(log, buf_f);
-                        return 0;
-                }
        }
        trace_xfs_log_recover_buf_recover(log, buf_f);
-        switch (buf_f->blf_type) {
-        case XFS_LI_BUF:
-                blkno = buf_f->blf_blkno;
-                len = buf_f->blf_len;
-                flags = buf_f->blf_flags;
-                break;
-        default:
-                xfs_fs_cmn_err(CE_ALERT, log->l_mp,
-                        "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
-                        buf_f->blf_type, log->l_mp->m_logname ?
-                        log->l_mp->m_logname : "internal");
-                XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
-                                 XFS_ERRLEVEL_LOW, log->l_mp);
-                return XFS_ERROR(EFSCORRUPTED);
-        }
-        mp = log->l_mp;
        buf_flags = XBF_LOCK;
-        if (!(flags & XFS_BLF_INODE_BUF))
+        if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
                buf_flags |= XBF_MAPPED;
-        bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
+        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+                          buf_flags);
        if (XFS_BUF_ISERROR(bp)) {
-                xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
+                xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
-                                  bp, blkno);
+                                  bp, buf_f->blf_blkno);
                error = XFS_BUF_GETERROR(bp);
                xfs_buf_relse(bp);
                return error;
        }
        error = 0;
-        if (flags & XFS_BLF_INODE_BUF) {
+        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-        } else if (flags &
+        } else if (buf_f->blf_flags &
                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
        } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
 }
 STATIC int
-xlog_recover_do_inode_trans(
+xlog_recover_inode_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_inode_log_format_t  *in_f;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        xfs_dinode_t            *dip;
-        xfs_ino_t               ino;
        int                     len;
        xfs_caddr_t             src;
        xfs_caddr_t             dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
        xfs_icdinode_t          *dicp;
        int                     need_free = 0;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
                in_f = item->ri_buf[0].i_addr;
        } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
                if (error)
                        goto error;
        }
-        ino = in_f->ilf_ino;
-        mp = log->l_mp;
        /*
         * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
-                        dip, bp, ino);
+                        dip, bp, in_f->ilf_ino);
-                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
+                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
-                        item, ino);
+                        item, in_f->ilf_ino);
-                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
+                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
        if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
-                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
                                         XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                                item, dip, bp, ino);
+                                item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
-                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
                                             XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                                item, dip, bp, ino);
+                                item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
-                        item, dip, bp, ino,
+                        item, dip, bp, in_f->ilf_ino,
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
-                        item, dip, bp, ino, dicp->di_forkoff);
+                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
-                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
+                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
                        break;
                default:
-                        xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
+                        xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
                        ASSERT(0);
                        xfs_buf_relse(bp);
                        error = EIO;
@@ -2556,18 +2422,11 @@ error:
 * of that type.
 */
 STATIC int
-xlog_recover_do_quotaoff_trans(
+xlog_recover_quotaoff_pass1(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
-        xfs_qoff_logformat_t    *qoff_f;
+        xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
-        if (pass == XLOG_RECOVER_PASS2) {
-                return (0);
-        }
-        qoff_f = item->ri_buf[0].i_addr;
        ASSERT(qoff_f);
        /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
 * Recover a dquot record
 */
 STATIC int
-xlog_recover_do_dquot_trans(
+xlog_recover_dquot_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        struct xfs_disk_dquot   *ddq, *recddq;
        int                     error;
        xfs_dq_logformat_t      *dq_f;
        uint                    type;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
-        mp = log->l_mp;
        /*
         * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
        if ((error = xfs_qm_dqcheck(recddq,
                           dq_f->qlf_id,
                           0, XFS_QMOPT_DOWARN,
-                           "xlog_recover_do_dquot_trans (log copy)"))) {
+                           "xlog_recover_dquot_pass2 (log copy)"))) {
                return XFS_ERROR(EIO);
        }
        ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
         * minimal initialization then.
         */
        if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-                           "xlog_recover_do_dquot_trans")) {
+                           "xlog_recover_dquot_pass2")) {
                xfs_buf_relse(bp);
                return XFS_ERROR(EIO);
        }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
 * LSN.
 */
 STATIC int
-xlog_recover_do_efi_trans(
+xlog_recover_efi_pass2(
        xlog_t                  *log,
        xlog_recover_item_t     *item,
-        xfs_lsn_t               lsn,
+        xfs_lsn_t               lsn)
-        int                     pass)
 {
        int                     error;
-        xfs_mount_t             *mp;
+        xfs_mount_t             *mp = log->l_mp;
        xfs_efi_log_item_t      *efip;
        xfs_efi_log_format_t    *efi_formatp;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return 0;
-        }
        efi_formatp = item->ri_buf[0].i_addr;
-        mp = log->l_mp;
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
                                         &(efip->efi_format)))) {
                xfs_efi_item_free(efip);
                return error;
        }
-        efip->efi_next_extent = efi_formatp->efi_nextents;
+        atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
-        efip->efi_flags |= XFS_EFI_COMMITTED;
        spin_lock(&log->l_ailp->xa_lock);
        /*
         * xfs_trans_ail_update() drops the AIL lock.
         */
-        xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
+        xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
        return 0;
 }
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
 * efd format structure.  If we find it, we remove the efi from the
 * AIL and free it.
 */
-STATIC void
+STATIC int
-xlog_recover_do_efd_trans(
+xlog_recover_efd_pass2(
        xlog_t                  *log,
-        xlog_recover_item_t     *item,
+        xlog_recover_item_t     *item)
-        int                     pass)
 {
        xfs_efd_log_format_t    *efd_formatp;
        xfs_efi_log_item_t      *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp = log->l_ailp;
-        if (pass == XLOG_RECOVER_PASS1) {
-                return;
-        }
        efd_formatp = item->ri_buf[0].i_addr;
        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
        }
        xfs_trans_ail_cursor_done(ailp, &cur);
        spin_unlock(&ailp->xa_lock);
-}
-/*
- * Perform the transaction
- *
- * If the transaction modifies a buffer or inode, do it now.  Otherwise,
- * EFIs and EFDs get queued up by adding entries into the AIL for them.
- */
-STATIC int
-xlog_recover_do_trans(
-        xlog_t                  *log,
-        xlog_recover_t          *trans,
-        int                     pass)
-{
-        int                     error = 0;
-        xlog_recover_item_t     *item;
-        error = xlog_recover_reorder_trans(log, trans, pass);
-        if (error)
-                return error;
-        list_for_each_entry(item, &trans->r_itemq, ri_list) {
-                trace_xfs_log_recover_item_recover(log, trans, item, pass);
-                switch (ITEM_TYPE(item)) {
-                case XFS_LI_BUF:
-                        error = xlog_recover_do_buffer_trans(log, item, pass);
-                        break;
-                case XFS_LI_INODE:
-                        error = xlog_recover_do_inode_trans(log, item, pass);
-                        break;
-                case XFS_LI_EFI:
-                        error = xlog_recover_do_efi_trans(log, item,
-                                                          trans->r_lsn, pass);
-                        break;
-                case XFS_LI_EFD:
-                        xlog_recover_do_efd_trans(log, item, pass);
-                        error = 0;
-                        break;
-                case XFS_LI_DQUOT:
-                        error = xlog_recover_do_dquot_trans(log, item, pass);
-                        break;
-                case XFS_LI_QUOTAOFF:
-                        error = xlog_recover_do_quotaoff_trans(log, item,
-                                                               pass);
-                        break;
-                default:
-                        xlog_warn(
-        "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
-                        ASSERT(0);
-                        error = XFS_ERROR(EIO);
-                        break;
-                }
-                if (error)
-                        return error;
-        }
        return 0;
 }
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
 */
 STATIC void
 xlog_recover_free_trans(
-        xlog_recover_t          *trans)
+        struct xlog_recover     *trans)
 {
        xlog_recover_item_t     *item, *n;
        int                     i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
 }
 STATIC int
+xlog_recover_commit_pass1(
+        struct log              *log,
+        struct xlog_recover     *trans,
+        xlog_recover_item_t     *item)
+{
+        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+        switch (ITEM_TYPE(item)) {
+        case XFS_LI_BUF:
+                return xlog_recover_buffer_pass1(log, item);
+        case XFS_LI_QUOTAOFF:
+                return xlog_recover_quotaoff_pass1(log, item);
+        case XFS_LI_INODE:
+        case XFS_LI_EFI:
+        case XFS_LI_EFD:
+        case XFS_LI_DQUOT:
+                /* nothing to do in pass 1 */
+                return 0;
+        default:
+                xlog_warn(
+        "XFS: invalid item type (%d) xlog_recover_commit_pass1",
+                        ITEM_TYPE(item));
+                ASSERT(0);
+                return XFS_ERROR(EIO);
+        }
+}
+STATIC int
+xlog_recover_commit_pass2(
+        struct log              *log,
+        struct xlog_recover     *trans,
+        xlog_recover_item_t     *item)
+{
+        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+        switch (ITEM_TYPE(item)) {
+        case XFS_LI_BUF:
+                return xlog_recover_buffer_pass2(log, item);
+        case XFS_LI_INODE:
+                return xlog_recover_inode_pass2(log, item);
+        case XFS_LI_EFI:
+                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+        case XFS_LI_EFD:
+                return xlog_recover_efd_pass2(log, item);
+        case XFS_LI_DQUOT:
+                return xlog_recover_dquot_pass2(log, item);
+        case XFS_LI_QUOTAOFF:
+                /* nothing to do in pass2 */
+                return 0;
+        default:
+                xlog_warn(
+        "XFS: invalid item type (%d) xlog_recover_commit_pass2",
+                        ITEM_TYPE(item));
+                ASSERT(0);
+                return XFS_ERROR(EIO);
+        }
+}
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
 xlog_recover_commit_trans(
-        xlog_t                  *log,
+        struct log              *log,
-        xlog_recover_t          *trans,
+        struct xlog_recover     *trans,
        int                     pass)
 {
-        int                     error;
+        int                     error = 0;
+        xlog_recover_item_t     *item;
        hlist_del(&trans->r_list);
-        if ((error = xlog_recover_do_trans(log, trans, pass)))
+        error = xlog_recover_reorder_trans(log, trans, pass);
+        if (error)
                return error;
-        xlog_recover_free_trans(trans);                 /* no error */
+        list_for_each_entry(item, &trans->r_itemq, ri_list) {
+                if (pass == XLOG_RECOVER_PASS1)
+                        error = xlog_recover_commit_pass1(log, trans, item);
+                else
+                        error = xlog_recover_commit_pass2(log, trans, item);
+                if (error)
+                        return error;
+        }
+        xlog_recover_free_trans(trans);
        return 0;
 }
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
        xfs_extent_t            *extp;
        xfs_fsblock_t           startblock_fsb;
-        ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+        ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
        /*
         * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
                                         extp->ext_len);
        }
-        efip->efi_flags |= XFS_EFI_RECOVERED;
+        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
        error = xfs_trans_commit(tp, 0);
        return error;
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
                 * Skip EFIs that we've already processed.
                 */
                efip = (xfs_efi_log_item_t *)lip;
-                if (efip->efi_flags & XFS_EFI_RECOVERED) {
+                if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
                        lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
                }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
        xfs_daddr_t     head_blk,
        xfs_daddr_t     tail_blk)
 {
-        int             error;
+        int             error, i;
        ASSERT(head_blk != tail_blk);
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
         * First do a pass to find all of the cancelled buf log items.
         * Store them in the buf_cancel_table for use in the second pass.
         */
-        log->l_buf_cancel_table =
+        log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
-                (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
+                                                 sizeof(struct list_head),
-                                                 sizeof(xfs_buf_cancel_t*),
                                                 KM_SLEEP);
+        for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+                INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
                                      XLOG_RECOVER_PASS1);
        if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
                int     i;
                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-                        ASSERT(log->l_buf_cancel_table[i] == NULL);
+                        ASSERT(list_empty(&log->l_buf_cancel_table[i]));
        }
 #endif  /* DEBUG */
@@ -3934,7 +3800,7 @@ xlog_recover_finish(
                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
        } else {
                cmn_err(CE_DEBUG,
-                        "!Ending clean XFS mount for filesystem: %s\n",
+                        "Ending clean XFS mount for filesystem: %s\n",
                        log->l_mp->m_fsname);
        }
        return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a39..d447aef84bc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
                pag = radix_tree_delete(&mp->m_perag_tree, agno);
                spin_unlock(&mp->m_perag_lock);
                ASSERT(pag);
+                ASSERT(atomic_read(&pag->pag_ref) == 0);
                call_rcu(&pag->rcu_head, __xfs_free_perag);
        }
 }
@@ -471,7 +472,7 @@ xfs_initialize_perag(
                        goto out_unwind;
                pag->pag_agno = index;
                pag->pag_mount = mp;
-                rwlock_init(&pag->pag_ici_lock);
+                spin_lock_init(&pag->pag_ici_lock);
                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                spin_lock_init(&pag->pag_buf_lock);
@@ -974,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 }
 /*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+        struct xfs_mount        *mp)
+{
+        int i;
+        for (i = 0; i < XFS_LOWSP_MAX; i++) {
+                __uint64_t space = mp->m_sb.sb_dblocks;
+                do_div(space, 100);
+                mp->m_low_space[i] = space * (i + 1);
+        }
+}
+/*
 * Set whether we're using inode alignment.
 */
 STATIC void
@@ -1195,6 +1214,9 @@ xfs_mountfs(
         */
        xfs_set_rw_sizes(mp);
+        /* set the low space thresholds for dynamic preallocation */
+        xfs_set_low_space_thresholds(mp);
        /*
         * Set the inode cluster size.
         * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b498074..a62e8971539 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
        xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+        XFS_LOWSP_1_PCNT = 0,
+        XFS_LOWSP_2_PCNT,
+        XFS_LOWSP_3_PCNT,
+        XFS_LOWSP_4_PCNT,
+        XFS_LOWSP_5_PCNT,
+        XFS_LOWSP_MAX,
+};
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
+        int64_t                 m_low_space[XFS_LOWSP_MAX];
+                                                /* low free space thresholds */
 } xfs_mount_t;
 /*
@@ -379,6 +391,8 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 extern int      xfs_dev_is_read_only(struct xfs_mount *, char *);
+extern void     xfs_set_low_space_thresholds(struct xfs_mount *);
 #endif  /* __KERNEL__ */
 extern void     xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2..edfa178bafb 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
        spin_lock(&mru->lock);
        if (mru->queued) {
                spin_unlock(&mru->lock);
-                cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+                cancel_delayed_work_sync(&mru->work);
                spin_lock(&mru->lock);
        }
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd..9bb6eda4cd2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
 #define xfs_trans_apply_dquot_deltas(tp)
 #define xfs_trans_unreserve_and_mod_dquots(tp)
-#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)      (0)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
-#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)      (0)
+                struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+        return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+                struct xfs_mount *mp, struct xfs_dquot *udqp,
+                struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+{
+        return 0;
+}
 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
 #define xfs_qm_vop_rename_dqattach(it)                                  (0)
 #define xfs_qm_vop_chown(tp, ip, old, new)                              (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_qm_dqdetach(ip)
 #define xfs_qm_dqrele(d)
 #define xfs_qm_statvfs(ip, s)
-#define xfs_qm_sync(mp, fl)                                             (0)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+        return 0;
+}
 #define xfs_qm_newmount(mp, a, b)                                       (0)
 #define xfs_qm_mount_quotas(mp)
 #define xfs_qm_unmount(mp)
-#define xfs_qm_unmount_quotas(mp)                                       (0)
+#define xfs_qm_unmount_quotas(mp)
 #endif /* CONFIG_XFS_QUOTA */
 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
         * it and some incremental backup programs won't work without it.
         */
        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
        /*
         * Adjust the link count on src_dp.  This is necessary when
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711..76922793f64 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
        if (blkdelta)
                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
 out:
-        ASSERT(error = 0);
+        ASSERT(error == 0);
        return;
 }
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
 * they could be immediately flushed and we'd have to race with the flusher
 * trying to pull the item from the AIL as we add it.
 */
-void
+static void
 xfs_trans_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               commit_lsn,
@@ -1425,21 +1425,120 @@ xfs_trans_committed(
        xfs_trans_free(tp);
 }
+static inline void
+xfs_log_item_batch_insert(
+        struct xfs_ail          *ailp,
+        struct xfs_log_item     **log_items,
+        int                     nr_items,
+        xfs_lsn_t               commit_lsn)
+{
+        int     i;
+        spin_lock(&ailp->xa_lock);
+        /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+        xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+        for (i = 0; i < nr_items; i++)
+                IOP_UNPIN(log_items[i], 0);
+}
 /*
- * Called from the trans_commit code when we notice that
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
- * the filesystem is in the middle of a forced shutdown.
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is IOP_COMMITTED processing, followed by an
+ * IOP_UNPIN(aborted) call.
+ */
+void
+xfs_trans_committed_bulk(
+        struct xfs_ail          *ailp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               commit_lsn,
+        int                     aborted)
+{
+#define LOG_ITEM_BATCH_SIZE     32
+        struct xfs_log_item     *log_items[LOG_ITEM_BATCH_SIZE];
+        struct xfs_log_vec      *lv;
+        int                     i = 0;
+        /* unpin all the log items */
+        for (lv = log_vector; lv; lv = lv->lv_next ) {
+                struct xfs_log_item     *lip = lv->lv_item;
+                xfs_lsn_t               item_lsn;
+                if (aborted)
+                        lip->li_flags |= XFS_LI_ABORTED;
+                item_lsn = IOP_COMMITTED(lip, commit_lsn);
+                /* item_lsn of -1 means the item was freed */
+                if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+                        continue;
+                /*
+                 * if we are aborting the operation, no point in inserting the
+                 * object into the AIL as we are in a shutdown situation.
+                 */
+                if (aborted) {
+                        ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+                        IOP_UNPIN(lip, 1);
+                        continue;
+                }
+                if (item_lsn != commit_lsn) {
+                        /*
+                         * Not a bulk update option due to unusual item_lsn.
+                         * Push into AIL immediately, rechecking the lsn once
+                         * we have the ail lock. Then unpin the item.
+                         */
+                        spin_lock(&ailp->xa_lock);
+                        if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+                                xfs_trans_ail_update(ailp, lip, item_lsn);
+                        else
+                                spin_unlock(&ailp->xa_lock);
+                        IOP_UNPIN(lip, 0);
+                        continue;
+                }
+                /* Item is a candidate for bulk AIL insert.  */
+                log_items[i++] = lv->lv_item;
+                if (i >= LOG_ITEM_BATCH_SIZE) {
+                        xfs_log_item_batch_insert(ailp, log_items,
+                                        LOG_ITEM_BATCH_SIZE, commit_lsn);
+                        i = 0;
+                }
+        }
+        /* make sure we insert the remainder! */
+        if (i)
+                xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+}
+/*
+ * Called from the trans_commit code when we notice that the filesystem is in
+ * the middle of a forced shutdown.
+ *
+ * When we are called here, we have already pinned all the items in the
+ * transaction. However, neither IOP_COMMITTING nor IOP_UNLOCK has been called
+ * so we can simply walk the items in the transaction, unpin them with an abort
+ * flag and then free the items. Note that unpinning the items can result in
+ * them being freed immediately, so we need to use a safe list traversal method
+ * here.
 */
 STATIC void
 xfs_trans_uncommit(
        struct xfs_trans        *tp,
        uint                    flags)
 {
-        struct xfs_log_item_desc *lidp;
+        struct xfs_log_item_desc *lidp, *n;
-        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+        list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
-                /*
-                 * Unpin all but those that aren't dirty.
-                 */
                if (lidp->lid_flags & XFS_LID_DIRTY)
                        IOP_UNPIN(lidp->lid_item, 1);
        }
@@ -1656,7 +1755,6 @@ xfs_trans_commit_cil(
        int                     flags)
 {
        struct xfs_log_vec      *log_vector;
-        int                     error;
        /*
         * Get each log item to allocate a vector structure for
@@ -1667,9 +1765,7 @@ xfs_trans_commit_cil(
        if (!log_vector)
                return ENOMEM;
-        error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+        xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
-        if (error)
-                return error;
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
        xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a8..c2042b736b8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
 #define XFS_ALLOC_BTREE_REF     2
 #define XFS_BMAP_BTREE_REF      2
 #define XFS_DIR_BTREE_REF       2
+#define XFS_INO_REF             2
 #define XFS_ATTR_BTREE_REF      1
-#define XFS_INO_REF             1
 #define XFS_DQUOT_REF           1
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff..c5bbbc45db9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
-STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
-STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
 STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
 STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
                xfs_log_move_tail(ailp->xa_mount, 1);
 }       /* xfs_trans_unlocked_item */
 /*
- * Update the position of the item in the AIL with the new
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
- * lsn.  If it is not yet in the AIL, add it.  Otherwise, move
+ *
- * it to its new position by removing it and re-adding it.
+ * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added.  Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
 *
- * Wakeup anyone with an lsn less than the item's lsn.  If the item
+ * This function executes the update operations on all the items in the array
- * we move in the AIL is the minimum one, update the tail lsn in the
+ * under a single hold of the AIL lock, which the caller must already have
- * log manager.
+ * taken. Under that lock, we need to check each log
+ * item LSN to confirm it needs to be moved forward in the AIL.
 *
- * This function must be called with the AIL lock held.  The lock
+ * To optimise the insert operation, we delete all the items from the AIL in
- * is dropped before returning.
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
 */
 void
-xfs_trans_ail_update(
+xfs_trans_ail_update_bulk(
-        struct xfs_ail  *ailp,
+        struct xfs_ail          *ailp,
-        xfs_log_item_t  *lip,
+        struct xfs_log_item     **log_items,
-        xfs_lsn_t       lsn) __releases(ailp->xa_lock)
+        int                     nr_items,
+        xfs_lsn_t               lsn) __releases(ailp->xa_lock)
 {
-        xfs_log_item_t          *dlip = NULL;
+        xfs_log_item_t          *mlip;
-        xfs_log_item_t          *mlip;  /* ptr to minimum lip */
        xfs_lsn_t               tail_lsn;
+        int                     mlip_changed = 0;
+        int                     i;
+        LIST_HEAD(tmp);
        mlip = xfs_ail_min(ailp);
-        if (lip->li_flags & XFS_LI_IN_AIL) {
+        for (i = 0; i < nr_items; i++) {
-                dlip = xfs_ail_delete(ailp, lip);
+                struct xfs_log_item *lip = log_items[i];
-                ASSERT(dlip == lip);
+                if (lip->li_flags & XFS_LI_IN_AIL) {
-                xfs_trans_ail_cursor_clear(ailp, dlip);
+                        /* check if we really need to move the item */
-        } else {
+                        if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
-                lip->li_flags |= XFS_LI_IN_AIL;
+                                continue;
+                        xfs_ail_delete(ailp, lip);
+                        if (mlip == lip)
+                                mlip_changed = 1;
+                } else {
+                        lip->li_flags |= XFS_LI_IN_AIL;
+                }
+                lip->li_lsn = lsn;
+                list_add(&lip->li_ail, &tmp);
        }
-        lip->li_lsn = lsn;
+        xfs_ail_splice(ailp, &tmp, lsn);
-        xfs_ail_insert(ailp, lip);
-        if (mlip == dlip) {
+        if (!mlip_changed) {
-                mlip = xfs_ail_min(ailp);
-                /*
-                 * It is not safe to access mlip after the AIL lock is
-                 * dropped, so we must get a copy of li_lsn before we do
-                 * so.  This is especially important on 32-bit platforms
-                 * where accessing and updating 64-bit values like li_lsn
-                 * is not atomic.
-                 */
-                tail_lsn = mlip->li_lsn;
-                spin_unlock(&ailp->xa_lock);
-                xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-        } else {
                spin_unlock(&ailp->xa_lock);
+                return;
        }
+        /*
-}       /* xfs_trans_update_ail */
+         * It is not safe to access mlip after the AIL lock is dropped, so we
+         * must get a copy of li_lsn before we do so.  This is especially
+         * important on 32-bit platforms where accessing and updating 64-bit
+         * values like li_lsn is not atomic.
+         */
+        mlip = xfs_ail_min(ailp);
+        tail_lsn = mlip->li_lsn;
+        spin_unlock(&ailp->xa_lock);
+        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
 /*
- * Delete the given item from the AIL.  It must already be in
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
- * the AIL.
 *
- * Wakeup anyone with an lsn less than item's lsn.    If the item
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
- * we delete in the AIL is the minimum one, update the tail lsn in the
+ * removed from the AIL. The caller is already holding the AIL lock, and done
- * log manager.
+ * all the checks necessary to ensure the items passed in via @log_items are
+ * ready for deletion. This includes checking that the items are in the AIL.
 *
- * Clear the IN_AIL flag from the item, reset its lsn to 0, and
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
- * bump the AIL's generation count to indicate that the tree
+ * flag from the item and reset the item's lsn to 0. If we remove the first
- * has changed.
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
 *
- * This function must be called with the AIL lock held.  The lock
+ * This function will not drop the AIL lock until all items are removed from
- * is dropped before returning.
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
 */
 void
-xfs_trans_ail_delete(
+xfs_trans_ail_delete_bulk(
-        struct xfs_ail  *ailp,
+        struct xfs_ail          *ailp,
-        xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+        struct xfs_log_item     **log_items,
+        int                     nr_items) __releases(ailp->xa_lock)
 {
-        xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
        xfs_lsn_t               tail_lsn;
+        int                     mlip_changed = 0;
+        int                     i;
-        if (lip->li_flags & XFS_LI_IN_AIL) {
+        mlip = xfs_ail_min(ailp);
-                mlip = xfs_ail_min(ailp);
-                dlip = xfs_ail_delete(ailp, lip);
-                ASSERT(dlip == lip);
-                xfs_trans_ail_cursor_clear(ailp, dlip);
-                lip->li_flags &= ~XFS_LI_IN_AIL;
+        for (i = 0; i < nr_items; i++) {
-                lip->li_lsn = 0;
+                struct xfs_log_item *lip = log_items[i];
+                if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+                        struct xfs_mount        *mp = ailp->xa_mount;
-                if (mlip == dlip) {
-                        mlip = xfs_ail_min(ailp);
-                        /*
-                         * It is not safe to access mlip after the AIL lock
-                         * is dropped, so we must get a copy of li_lsn
-                         * before we do so.  This is especially important
-                         * on 32-bit platforms where accessing and updating
-                         * 64-bit values like li_lsn is not atomic.
-                         */
-                        tail_lsn = mlip ? mlip->li_lsn : 0;
-                        spin_unlock(&ailp->xa_lock);
-                        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-                } else {
                        spin_unlock(&ailp->xa_lock);
+                        if (!XFS_FORCED_SHUTDOWN(mp)) {
+                                xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+                "%s: attempting to delete a log item that is not in the AIL",
+                                                __func__);
+                                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+                        }
+                        return;
                }
+                xfs_ail_delete(ailp, lip);
+                lip->li_flags &= ~XFS_LI_IN_AIL;
+                lip->li_lsn = 0;
+                if (mlip == lip)
+                        mlip_changed = 1;
        }
-        else {
-                /*
-                 * If the file system is not being shutdown, we are in
-                 * serious trouble if we get to this stage.
-                 */
-                struct xfs_mount        *mp = ailp->xa_mount;
+        if (!mlip_changed) {
                spin_unlock(&ailp->xa_lock);
-                if (!XFS_FORCED_SHUTDOWN(mp)) {
+                return;
-                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-                "%s: attempting to delete a log item that is not in the AIL",
-                                        __func__);
-                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-                }
        }
-}
+        /*
+         * It is not safe to access mlip after the AIL lock is dropped, so we
+         * must get a copy of li_lsn before we do so.  This is especially
+         * important on 32-bit platforms where accessing and updating 64-bit
+         * values like li_lsn is not atomic. It is possible we've emptied the
+         * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+         */
+        mlip = xfs_ail_min(ailp);
+        tail_lsn = mlip ? mlip->li_lsn : 0;
+        spin_unlock(&ailp->xa_lock);
+        xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
 /*
 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
 }
 /*
- * Insert the given log item into the AIL.
+ * splice the log item list into the AIL at the given LSN.
- * We almost always insert at the end of the list, so on inserts
- * we search from the end of the list to find where the
- * new item belongs.
 */
 STATIC void
-xfs_ail_insert(
+xfs_ail_splice(
        struct xfs_ail  *ailp,
-        xfs_log_item_t  *lip)
+        struct list_head *list,
-/* ARGSUSED */
+        xfs_lsn_t       lsn)
 {
        xfs_log_item_t  *next_lip;
@@ -640,39 +660,33 @@ xfs_ail_insert(
         * If the list is empty, just insert the item.
         */
        if (list_empty(&ailp->xa_ail)) {
-                list_add(&lip->li_ail, &ailp->xa_ail);
+                list_splice(list, &ailp->xa_ail);
                return;
        }
        list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-                if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+                if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
                        break;
        }
        ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
-               (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
+               (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
-        list_add(&lip->li_ail, &next_lip->li_ail);
-        xfs_ail_check(ailp, lip);
+        list_splice_init(list, &next_lip->li_ail);
        return;
 }
 /*
 * Delete the given item from the AIL.  Return a pointer to the item.
 */
-/*ARGSUSED*/
+STATIC void
-STATIC xfs_log_item_t *
 xfs_ail_delete(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        xfs_ail_check(ailp, lip);
        list_del(&lip->li_ail);
+        xfs_trans_ail_cursor_clear(ailp, lip);
-        return lip;
 }
 /*
@@ -682,7 +696,6 @@ xfs_ail_delete(
 STATIC xfs_log_item_t *
 xfs_ail_min(
        struct xfs_ail  *ailp)
-/* ARGSUSED */
 {
        if (list_empty(&ailp->xa_ail))
                return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
 xfs_ail_next(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        if (lip->li_ail.next == &ailp->xa_ail)
                return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa7..f7590f5bade 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t		*tp,
        tp->t_flags |= XFS_TRANS_DIRTY;
        efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-        next_extent = efip->efi_next_extent;
+        /*
+         * atomic_inc_return gives us the value after the increment;
+         * we want to use it as an array index so we need to subtract 1 from
+         * it.
+         */
+        next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
        ASSERT(next_extent < efip->efi_format.efi_nextents);
        extp = &(efip->efi_format.efi_extents[next_extent]);
        extp->ext_start = start_block;
        extp->ext_len = ext_len;
-        efip->efi_next_extent++;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de..35162c238fa 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
 struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
 void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void    xfs_trans_del_item(struct xfs_log_item *);
 void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
                                int flags);
-void    xfs_trans_item_committed(struct xfs_log_item *lip,
-                                xfs_lsn_t commit_lsn, int aborted);
 void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+void    xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+                                xfs_lsn_t commit_lsn, int aborted);
 /*
 * AIL traversal cursor.
 *
@@ -73,12 +75,29 @@ struct xfs_ail {
 /*
 * From xfs_trans_ail.c
 */
-void                    xfs_trans_ail_update(struct xfs_ail *ailp,
+void    xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
-                                        struct xfs_log_item *lip, xfs_lsn_t lsn)
+                                struct xfs_log_item **log_items, int nr_items,
-                                        __releases(ailp->xa_lock);
+                                xfs_lsn_t lsn) __releases(ailp->xa_lock);
-void                    xfs_trans_ail_delete(struct xfs_ail *ailp,
+static inline void
-                                        struct xfs_log_item *lip)
+xfs_trans_ail_update(
-                                        __releases(ailp->xa_lock);
+        struct xfs_ail          *ailp,
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn) __releases(ailp->xa_lock)
+{
+        xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+}
+void    xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+                                struct xfs_log_item **log_items, int nr_items)
+                                __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+        struct xfs_ail  *ailp,
+        xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+{
+        xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
 void                    xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
 void                    xfs_trans_unlocked_item(struct xfs_ail *,
                                        xfs_log_item_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151..d8e6f8cd6f0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
                        xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
        }
-        if (ip->i_d.di_nlink != 0) {
+        if (ip->i_d.di_nlink == 0)
-                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+                return 0;
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
-                     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-                    (!(ip->i_d.di_flags &
-                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-                        /*
+        if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                         * If we can't get the iolock just skip truncating
+             ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                         * the blocks past EOF because we could deadlock
+               ip->i_delayed_blks > 0)) &&
-                         * with the mmap_sem otherwise.  We'll get another
+             (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-                         * chance to drop them once the last reference to
+            (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-                         * the inode is dropped, so we'll never leak blocks
-                         * permanently.
-                         */
-                        error = xfs_free_eofblocks(mp, ip,
-                                                   XFS_FREE_EOF_TRYLOCK);
-                        if (error)
-                                return error;
-                }
-        }
+                /*
+                 * If we can't get the iolock just skip truncating the blocks
+                 * past EOF because we could deadlock with the mmap_sem
+                 * otherwise.  We'll get another chance to drop them once the
+                 * last reference to the inode is dropped, so we'll never leak
+                 * blocks permanently.
+                 *
+                 * Further, check if the inode is being opened, written and
+                 * closed frequently and we have delayed allocation blocks
+                 * outstanding (e.g. streaming writes from the NFS server),
+                 * truncating the blocks past EOF will cause fragmentation to
+                 * occur.
+                 *
+                 * In this case don't do the truncation, either, but we have to
+                 * be careful how we detect this case. Blocks beyond EOF show
+                 * up as i_delayed_blks even when the inode is clean, so we
+                 * need to truncate them away first before checking for a dirty
+                 * release. Hence on the first dirty close we will still remove
+                 * the speculative allocation, but after that we will leave it
+                 * in place.
+                 */
+                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                        return 0;
+                error = xfs_free_eofblocks(mp, ip,
+                                           XFS_FREE_EOF_TRYLOCK);
+                if (error)
+                        return error;
+                /* delalloc blocks after truncation means it really is dirty */
+                if (ip->i_delayed_blks)
+                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+        }
        return 0;
 }
author	Dmitry Torokhov <dmitry.torokhov@gmail.com>	2011-03-19 02:38:50 -0400
committer	Dmitry Torokhov <dmitry.torokhov@gmail.com>	2011-03-19 02:38:50 -0400
commit	97eb3f24352ec6632c2127b35d8087d2a809a9b9 (patch)
tree	722948059bbd325bbca232269490124231df80d4 /fs
parent	439581ec07fa9cf3f519dd461a2cf41cfd3adcb4 (diff)
parent	def179c271ac9b5020deca798470521f14d11edd (diff)